diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel')
56 files changed, 6783 insertions, 3860 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index e1ef3f9be0a5..aa38c63dc9dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) { ; GCN-LABEL: v_andn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_andn2_i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 @@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) { define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) { ; GCN-LABEL: v_andn2_i32_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i32_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 @@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) { define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) { ; GCN-LABEL: v_andn2_i32_vs: ; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s0, s2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_bfi_b32 v0, s2, 0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i32_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_not_b32 s0, s2 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, 0, v0 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 @@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) { ; GCN-LABEL: v_andn2_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_not_b32_e32 v2, v2 -; GCN-NEXT: v_not_b32_e32 v3, v3 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0 +; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_andn2_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2 -; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 @@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) { define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) { ; GCN-LABEL: v_andn2_i64_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: v_and_b32_e32 v0, s2, v0 -; GCN-NEXT: v_and_b32_e32 v1, s3, v1 +; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2 +; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 -; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index e6e98fb6edf2..206011adf021 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 481a2540eacb..7e297f46a780 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -73,7 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -192,7 +192,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -311,7 +311,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -429,7 +429,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -547,7 +547,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -666,7 +666,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -785,7 +785,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -903,7 +903,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -1021,7 +1021,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -1140,7 +1140,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -1259,7 +1259,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -1377,7 +1377,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 2226fd20fb77..302b2395642d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -7,12 +7,215 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_mov_b32 s4, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0 +; CI-NEXT: s_cbranch_vccz .LBB0_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB0_2: ; %Flow18 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB0_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v1, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v1, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB0_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB0_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB0_5 +; CI-NEXT: s_branch .LBB0_7 +; CI-NEXT: .LBB0_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB0_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v1, s4, v0 +; CI-NEXT: .LBB0_8: ; %Flow19 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; CI-NEXT: s_and_b32 s2, s2, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s2, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: s_and_b32 s2, 1, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[10:11], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x8 +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s0| +; VI-NEXT: v_cvt_f32_f16_e64 v0, |s1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v0 +; VI-NEXT: s_cbranch_vccz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB0_2: ; %Flow18 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB0_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v2, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v5 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_ldexp_f32 v4, v2, 11 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v3, v0 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB0_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB0_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB0_5 +; VI-NEXT: s_branch .LBB0_7 +; VI-NEXT: .LBB0_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB0_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v2, s2, v0 +; VI-NEXT: .LBB0_8: ; %Flow19 +; VI-NEXT: v_mov_b32_e32 v0, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm + %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 + %r0 = load half, ptr addrspace(1) %in1, align 4 + %r1 = load half, ptr addrspace(1) %gep2, align 4 + %r2 = frem half %r0, %r1 + store half %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 @@ -27,15 +230,21 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f16: +; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -65,33 +274,51 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem half %r0, %r1 + %r2 = frem fast half %r0, %r1 store half %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f16: +define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { +; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v5, v4, v4 +; CI-NEXT: v_mul_f32_e32 v5, v3, v4 +; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v5, v6, v4, v5 +; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: fast_frem_f16: +; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -99,11 +326,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s3 -; VI-NEXT: v_mul_f16_e32 v0, s2, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_mul_f32_e32 v4, v0, v3 +; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 +; VI-NEXT: v_mac_f32_e32 v4, v5, v3 +; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_fma_f16 v2, -v0, v1, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -111,59 +348,209 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem fast half %r0, %r1 + %r2 = frem afn half %r0, %r1 store half %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { -; CI-LABEL: unsafe_frem_f16: +define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_mov_b32 s4, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB3_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s4, s2, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB3_2: ; %Flow16 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB3_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB3_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB3_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB3_5 +; CI-NEXT: s_branch .LBB3_7 +; CI-NEXT: .LBB3_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB3_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s4, s2, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s4, v0 +; CI-NEXT: .LBB3_8: ; %Flow17 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: unsafe_frem_f16: +; VI-LABEL: frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x8 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s4, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s3 -; VI-NEXT: v_mul_f16_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s4, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: .LBB3_2: ; %Flow16 +; VI-NEXT: s_xor_b32 s4, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cbranch_scc1 .LBB3_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB3_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB3_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB3_5 +; VI-NEXT: s_branch .LBB3_7 +; VI-NEXT: .LBB3_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB3_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s4, s2, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: .LBB3_8: ; %Flow17 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm - %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 - %r0 = load half, ptr addrspace(1) %in1, align 4 - %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem afn half %r0, %r1 - store half %r2, ptr addrspace(1) %out, align 4 + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: frem_f32: +define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -192,7 +579,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f32: +; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -223,43 +610,65 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem float %r0, %r1 + %r2 = frem fast float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f32: +define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { +; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s3 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 -; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; CI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v4, v3, v3 +; CI-NEXT: v_mul_f32_e32 v4, v2, v3 +; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v4, v5, v3, v4 +; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 +; CI-NEXT: v_trunc_f32_e32 v1, v1 +; CI-NEXT: v_fma_f32 v0, -v1, v0, s6 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: fast_frem_f32: +; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 +; VI-NEXT: v_trunc_f32_e32 v1, v1 +; VI-NEXT: v_fma_f32 v2, -v1, v0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -267,57 +676,238 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem fast float %r0, %r1 + %r2 = frem afn float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { -; CI-LABEL: unsafe_frem_f32: +define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s3 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 -; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB6_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_brev_b32 s7, 1 +; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB6_2: ; %Flow16 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB6_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]| +; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8 +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0 +; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB6_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 +; CI-NEXT: .LBB6_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; CI-NEXT: s_cbranch_vccnz .LBB6_5 +; CI-NEXT: s_branch .LBB6_7 +; CI-NEXT: .LBB6_6: +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: .LBB6_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 +; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_brev_b32 s7, 1 +; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_or_b32_e32 v0, s6, v0 +; CI-NEXT: v_or_b32_e32 v1, s7, v1 +; CI-NEXT: .LBB6_8: ; %Flow17 +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7ff80000 +; CI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: unsafe_frem_f32: +; VI-LABEL: frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s6, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_brev_b32 s7, 1 +; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB6_2: ; %Flow16 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB6_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]| +; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8 +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0 +; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB6_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 +; VI-NEXT: .LBB6_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; VI-NEXT: s_cbranch_vccnz .LBB6_5 +; VI-NEXT: s_branch .LBB6_7 +; VI-NEXT: .LBB6_6: +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: .LBB6_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 +; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_brev_b32 s7, 1 +; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: v_or_b32_e32 v0, s6, v0 +; VI-NEXT: v_or_b32_e32 v1, s7, v1 +; VI-NEXT: .LBB6_8: ; %Flow17 +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x7ff80000 +; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm - %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 - %r0 = load float, ptr addrspace(1) %in1, align 4 - %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem afn float %r0, %r1 - store float %r2, ptr addrspace(1) %out, align 4 + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: frem_f64: +define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -345,7 +935,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f64: +; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -374,63 +964,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, ptr addrspace(1) %out, align 8 - ret void -} - -define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f64: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: fast_frem_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm - %r0 = load double, ptr addrspace(1) %in1, align 8 - %r1 = load double, ptr addrspace(1) %in2, align 8 %r2 = frem fast double %r0, %r1 store double %r2, ptr addrspace(1) %out, align 8 ret void @@ -445,20 +978,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] +; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] +; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -470,18 +1006,21 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] +; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] +; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -497,102 +1036,372 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dword s0, s[10:11], 0x0 +; CI-NEXT: s_load_dword s1, s[2:3], 0x4 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: s_lshr_b32 s5, s3, 16 -; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; CI-NEXT: s_cbranch_vccz .LBB9_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s2, s0, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB9_2: ; %Flow57 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB9_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB9_5 +; CI-NEXT: s_branch .LBB9_7 +; CI-NEXT: .LBB9_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: s_and_b32 s2, s0, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: .LBB9_8: ; %Flow58 +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: s_lshr_b32 s3, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s3| +; CI-NEXT: s_mov_b32 s4, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB9_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB9_10: ; %Flow53 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB9_16 +; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 +; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; CI-NEXT: s_cbranch_vccnz .LBB9_13 +; CI-NEXT: s_branch .LBB9_15 +; CI-NEXT: .LBB9_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v1, s4, v1 +; CI-NEXT: .LBB9_16: ; %Flow54 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, 0 +; CI-NEXT: s_and_b32 s0, s0, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00 +; CI-NEXT: s_cselect_b32 s4, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CI-NEXT: s_and_b32 s2, s2, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s2, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7e00 +; CI-NEXT: s_and_b32 s3, 1, s4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; CI-NEXT: s_and_b32 s0, 1, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dword s0, s[10:11], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x10 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: v_mul_f32_e32 v4, v0, v3 -; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 -; VI-NEXT: v_mac_f32_e32 v4, v5, v3 -; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 -; VI-NEXT: v_mul_f32_e32 v0, v0, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; VI-NEXT: s_cbranch_vccz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB9_2: ; %Flow57 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB9_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; VI-NEXT: v_ldexp_f32 v1, v3, 1 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 11 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB9_5 +; VI-NEXT: s_branch .LBB9_7 +; VI-NEXT: .LBB9_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; VI-NEXT: v_rcp_f32_e32 v4, v3 -; VI-NEXT: v_mul_f32_e32 v5, v1, v4 -; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 -; VI-NEXT: v_mac_f32_e32 v5, v6, v4 -; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 -; VI-NEXT: v_mul_f32_e32 v1, v1, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; VI-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: .LBB9_8: ; %Flow58 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; VI-NEXT: s_mov_b32 s3, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; VI-NEXT: s_cbranch_vccz .LBB9_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_and_b32 s3, s4, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: .LBB9_10: ; %Flow53 +; VI-NEXT: s_xor_b32 s3, s3, 1 +; VI-NEXT: s_cmp_lg_u32 s3, 0 +; VI-NEXT: s_cbranch_scc1 .LBB9_16 +; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; VI-NEXT: v_ldexp_f32 v2, v4, 1 +; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 11 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; VI-NEXT: s_cbranch_vccnz .LBB9_13 +; VI-NEXT: s_branch .LBB9_15 +; VI-NEXT: .LBB9_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s4 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v1, -v1, v2, s4 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s3, s4, 0x8000 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v1, s3, v1 +; VI-NEXT: .LBB9_16: ; %Flow54 +; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v2 +; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -606,176 +1415,714 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_lshr_b32 s10, s4, 16 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0 -; CI-NEXT: s_lshr_b32 s11, s5, 16 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; CI-NEXT: s_cbranch_vccz .LBB10_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CI-NEXT: s_mov_b32 s0, 0 +; CI-NEXT: .LBB10_2: ; %Flow135 +; CI-NEXT: s_xor_b32 s0, s0, 1 +; CI-NEXT: s_cmp_lg_u32 s0, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 +; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB10_5 +; CI-NEXT: s_branch .LBB10_7 +; CI-NEXT: .LBB10_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: .LBB10_8: ; %Flow136 +; CI-NEXT: s_lshr_b32 s6, s4, 16 +; CI-NEXT: s_lshr_b32 s0, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s6| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB10_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_10: ; %Flow131 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_16 +; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 +; CI-NEXT: v_div_scale_f32 v4, s[10:11], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; CI-NEXT: s_cbranch_vccnz .LBB10_13 +; CI-NEXT: s_branch .LBB10_15 +; CI-NEXT: .LBB10_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2 -; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2 -; CI-NEXT: v_rcp_f32_e32 v6, v4 +; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: .LBB10_16: ; %Flow132 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s5| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr2 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; CI-NEXT: s_cbranch_vccz .LBB10_18 +; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_18: ; %Flow127 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_24 +; CI-NEXT: ; %bb.19: ; %frem.compute52 +; CI-NEXT: v_frexp_mant_f32_e32 v5, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1 +; CI-NEXT: v_div_scale_f32 v5, s[10:11], v3, v3, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7 +; CI-NEXT: v_ldexp_f32_e64 v6, v2, 11 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v5, v6 -; CI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v9, v10 +; CI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v2, -v4, v3, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 +; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_22 +; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: v_mul_f32_e32 v6, v7, v5 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; CI-NEXT: v_add_f32_e32 v8, v6, v3 +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: s_cbranch_vccnz .LBB10_21 +; CI-NEXT: s_branch .LBB10_23 +; CI-NEXT: .LBB10_22: +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v5 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[2:3], v4, v4, v3 -; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3 -; CI-NEXT: v_rcp_f32_e32 v7, v5 +; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, s1, v2 +; CI-NEXT: .LBB10_24: ; %Flow128 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshr_b32 s10, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |s7| +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s10| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; CI-NEXT: s_cbranch_vccz .LBB10_26 +; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v6, s7 +; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_26: ; %Flow123 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_32 +; CI-NEXT: ; %bb.27: ; %frem.compute85 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v4 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 +; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1 +; CI-NEXT: v_div_scale_f32 v6, s[12:13], v4, v4, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v5 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8 +; CI-NEXT: v_ldexp_f32_e64 v7, v3, 11 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; CI-NEXT: v_rcp_f32_e32 v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v6, v7 -; CI-NEXT: v_fma_f32 v9, -v5, v8, v6 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v5, -v5, v8, v6 +; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; CI-NEXT: v_fma_f32 v11, v12, v11, v11 +; CI-NEXT: v_mul_f32_e32 v12, v10, v11 +; CI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; CI-NEXT: v_fma_f32 v12, v13, v11, v12 +; CI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 +; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_30 +; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: v_mul_f32_e32 v7, v8, v6 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v9, v7, v4 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5 +; CI-NEXT: s_cbranch_vccnz .LBB10_29 +; CI-NEXT: s_branch .LBB10_31 +; CI-NEXT: .LBB10_30: +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5 +; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 +; CI-NEXT: v_mul_f32_e32 v6, v5, v6 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; CI-NEXT: v_or_b32_e32 v3, s1, v3 +; CI-NEXT: .LBB10_32: ; %Flow124 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, 0 +; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00 +; CI-NEXT: s_cselect_b32 s11, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; CI-NEXT: s_and_b32 s2, s6, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s3 +; CI-NEXT: s_and_b32 s4, s5, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 +; CI-NEXT: s_cselect_b32 s12, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; CI-NEXT: s_and_b32 s7, s7, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00 +; CI-NEXT: s_cselect_b32 s7, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_mov_b32_e32 v4, 0x7e00 +; CI-NEXT: s_and_b32 s10, 1, s11 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s10 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; CI-NEXT: s_and_b32 s0, 1, s6 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 -; CI-NEXT: v_trunc_f32_e32 v5, v5 -; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; CI-NEXT: s_and_b32 s0, 1, s12 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: s_and_b32 s0, 1, s7 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_lshr_b32 s8, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: s_lshr_b32 s6, s2, 16 -; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: s_lshr_b32 s9, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s3, 16 -; VI-NEXT: v_mul_f32_e32 v4, v0, v3 -; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 -; VI-NEXT: v_mac_f32_e32 v4, v5, v3 -; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 -; VI-NEXT: v_mul_f32_e32 v0, v0, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s8| +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; VI-NEXT: s_cbranch_vccz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s0, s8, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_2: ; %Flow135 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; VI-NEXT: v_ldexp_f32 v1, v3, 1 +; VI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 11 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB10_5 +; VI-NEXT: s_branch .LBB10_7 +; VI-NEXT: .LBB10_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; VI-NEXT: v_rcp_f32_e32 v4, v3 -; VI-NEXT: v_mul_f32_e32 v5, v1, v4 -; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 -; VI-NEXT: v_mac_f32_e32 v5, v6, v4 -; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 -; VI-NEXT: v_mul_f32_e32 v1, v1, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; VI-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-NEXT: s_and_b32 s0, s8, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: .LBB10_8: ; %Flow136 +; VI-NEXT: s_lshr_b32 s4, s8, 16 +; VI-NEXT: s_lshr_b32 s2, s6, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; VI-NEXT: s_cbranch_vccz .LBB10_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_and_b32 s0, s4, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_10: ; %Flow131 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_16 +; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; VI-NEXT: v_ldexp_f32 v2, v4, 1 +; VI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 11 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; VI-NEXT: s_cbranch_vccnz .LBB10_13 +; VI-NEXT: s_branch .LBB10_15 +; VI-NEXT: .LBB10_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_rcp_f32_e32 v5, v4 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mul_f32_e32 v6, v2, v5 -; VI-NEXT: v_mad_f32 v7, -v4, v6, v2 -; VI-NEXT: v_mac_f32_e32 v6, v7, v5 -; VI-NEXT: v_mad_f32 v2, -v4, v6, v2 -; VI-NEXT: v_mul_f32_e32 v2, v2, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; VI-NEXT: v_add_f32_e32 v2, v2, v6 +; VI-NEXT: s_and_b32 s0, s4, 0x8000 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 +; VI-NEXT: .LBB10_16: ; %Flow132 +; VI-NEXT: v_cvt_f32_f16_e64 v4, |s9| +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s7| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; VI-NEXT: s_cbranch_vccz .LBB10_18 +; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: s_and_b32 s0, s9, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_18: ; %Flow127 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_24 +; VI-NEXT: ; %bb.19: ; %frem.compute52 +; VI-NEXT: v_frexp_mant_f32_e32 v5, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; VI-NEXT: v_ldexp_f32 v3, v5, 1 +; VI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7 +; VI-NEXT: v_ldexp_f32 v6, v2, 11 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v5 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v9, v10 +; VI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v5, -v5, v11, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 +; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_22 +; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v5 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v8, v6, v3 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_ldexp_f32 v6, v6, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; VI-NEXT: s_cbranch_vccnz .LBB10_21 +; VI-NEXT: s_branch .LBB10_23 +; VI-NEXT: .LBB10_22: +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 +; VI-NEXT: v_ldexp_f32 v4, v7, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v5 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3 -; VI-NEXT: v_trunc_f16_e32 v2, v2 -; VI-NEXT: v_fma_f16 v2, -v2, v3, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; VI-NEXT: v_rcp_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v7, v3, v6 -; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 -; VI-NEXT: v_mac_f32_e32 v7, v8, v6 -; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 -; VI-NEXT: v_mul_f32_e32 v3, v3, v6 -; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; VI-NEXT: v_add_f32_e32 v3, v3, v7 +; VI-NEXT: s_and_b32 s0, s9, 0x8000 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v2, s0, v2 +; VI-NEXT: .LBB10_24: ; %Flow128 +; VI-NEXT: s_lshr_b32 s12, s9, 16 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v5, |s12| +; VI-NEXT: v_cvt_f32_f16_e64 v4, |s10| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; VI-NEXT: s_cbranch_vccz .LBB10_26 +; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: s_and_b32 s0, s12, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_26: ; %Flow123 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_32 +; VI-NEXT: ; %bb.27: ; %frem.compute85 +; VI-NEXT: v_frexp_mant_f32_e32 v6, v4 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 +; VI-NEXT: v_ldexp_f32 v4, v6, 1 +; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v3, v5 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8 +; VI-NEXT: v_ldexp_f32 v7, v3, 11 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; VI-NEXT: v_rcp_f32_e32 v11, v6 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; VI-NEXT: v_fma_f32 v11, v12, v11, v11 +; VI-NEXT: v_mul_f32_e32 v12, v10, v11 +; VI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; VI-NEXT: v_fma_f32 v12, v13, v11, v12 +; VI-NEXT: v_fma_f32 v6, -v6, v12, v10 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 +; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_30 +; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v6 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v9, v7, v4 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 +; VI-NEXT: v_ldexp_f32 v7, v7, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5 +; VI-NEXT: s_cbranch_vccnz .LBB10_29 +; VI-NEXT: s_branch .LBB10_31 +; VI-NEXT: .LBB10_30: +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5 +; VI-NEXT: v_ldexp_f32 v5, v8, v5 +; VI-NEXT: v_mul_f32_e32 v6, v5, v6 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v4, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_ldexp_f32 v3, v4, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7 -; VI-NEXT: v_trunc_f16_e32 v3, v3 -; VI-NEXT: v_fma_f16 v3, -v3, v4, s7 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s0, s12, 0x8000 +; VI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v3, s0, v3 +; VI-NEXT: .LBB10_32: ; %Flow124 +; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s6, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s8|, v4 +; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v4 +; VI-NEXT: v_cmp_nge_f16_e64 s[8:9], |s9|, v4 +; VI-NEXT: v_cmp_nge_f16_e64 s[12:13], |s12|, v4 +; VI-NEXT: v_mov_b32_e32 v4, 0x7e00 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[2:3] +; VI-NEXT: v_cmp_nlg_f16_e64 s[6:7], s7, 0 +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[4:5] +; VI-NEXT: v_cmp_nlg_f16_e64 s[10:11], s10, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[6:7] +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[10:11] +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[12:13] +; VI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[8:9] +; VI-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -791,43 +2138,171 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 -; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB11_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s6, s2, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB11_2: ; %Flow53 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB11_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; CI-NEXT: v_fma_f32 v3, v4, v3, v3 -; CI-NEXT: v_mul_f32_e32 v4, v2, v3 -; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; CI-NEXT: v_fma_f32 v4, v5, v3, v4 -; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB11_5 +; CI-NEXT: s_branch .LBB11_7 +; CI-NEXT: .LBB11_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s6, s2, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s6, v0 +; CI-NEXT: .LBB11_8: ; %Flow54 ; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 -; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1| +; CI-NEXT: s_mov_b32 s6, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB11_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_and_b32 s6, s3, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2| +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB11_10: ; %Flow49 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB11_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5| +; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 +; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5| +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; CI-NEXT: s_cbranch_vccnz .LBB11_13 +; CI-NEXT: s_branch .LBB11_15 +; CI-NEXT: .LBB11_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: s_and_b32 s6, s3, 0x80000000 +; CI-NEXT: v_or_b32_e32 v1, s6, v1 +; CI-NEXT: .LBB11_16: ; %Flow50 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -836,42 +2311,170 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s6, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 -; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s6, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB11_2: ; %Flow53 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB11_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v0, -v1, v0, s2 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB11_5 +; VI-NEXT: s_branch .LBB11_7 +; VI-NEXT: .LBB11_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s6, s2, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s6, v0 +; VI-NEXT: .LBB11_8: ; %Flow54 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 -; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 -; VI-NEXT: v_rcp_f32_e32 v4, v2 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1| +; VI-NEXT: s_mov_b32 s6, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB11_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: s_and_b32 s6, s3, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2| +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB11_10: ; %Flow49 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB11_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5| +; VI-NEXT: v_ldexp_f32 v2, v2, 1 +; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5| +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 12 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; VI-NEXT: v_fma_f32 v4, v5, v4, v4 -; VI-NEXT: v_mul_f32_e32 v5, v3, v4 -; VI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; VI-NEXT: v_fma_f32 v5, v6, v4, v5 -; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 -; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_fma_f32 v1, -v2, v1, s3 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; VI-NEXT: s_cbranch_vccnz .LBB11_13 +; VI-NEXT: s_branch .LBB11_15 +; VI-NEXT: .LBB11_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: s_and_b32 s6, s3, 0x80000000 +; VI-NEXT: v_or_b32_e32 v1, s6, v1 +; VI-NEXT: .LBB11_16: ; %Flow50 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -892,73 +2495,327 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 -; CI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 -; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB12_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s2, s4, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_2: ; %Flow127 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s4| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; CI-NEXT: v_fma_f32 v3, v4, v3, v3 -; CI-NEXT: v_mul_f32_e32 v4, v2, v3 -; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; CI-NEXT: v_fma_f32 v4, v5, v3, v4 -; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 -; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s4 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB12_5 +; CI-NEXT: s_branch .LBB12_7 +; CI-NEXT: .LBB12_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s2, s4, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: .LBB12_8: ; %Flow128 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 -; CI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB12_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_and_b32 s2, s5, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v2, s9 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_10: ; %Flow123 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 +; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9| +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v1, s5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; CI-NEXT: s_cbranch_vccnz .LBB12_13 +; CI-NEXT: s_branch .LBB12_15 +; CI-NEXT: .LBB12_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: s_and_b32 s2, s5, 0x80000000 +; CI-NEXT: v_or_b32_e32 v1, s2, v1 +; CI-NEXT: .LBB12_16: ; %Flow124 ; CI-NEXT: v_mov_b32_e32 v2, s10 -; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 -; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr2 +; CI-NEXT: s_cbranch_vccz .LBB12_18 +; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: s_and_b32 s2, s6, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v3, s10 +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_18: ; %Flow119 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_24 +; CI-NEXT: ; %bb.19: ; %frem.compute46 +; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10| +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s6| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10| +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7 +; CI-NEXT: v_ldexp_f32_e64 v6, v2, 12 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v9, v10 +; CI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v2, -v3, v2, s6 +; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_22 +; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: v_mul_f32_e32 v6, v7, v5 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; CI-NEXT: v_add_f32_e32 v8, v6, v3 +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -12, v4 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; CI-NEXT: s_cbranch_vccnz .LBB12_21 +; CI-NEXT: s_branch .LBB12_23 +; CI-NEXT: .LBB12_22: +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v5 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; CI-NEXT: s_and_b32 s2, s6, 0x80000000 +; CI-NEXT: v_or_b32_e32 v2, s2, v2 +; CI-NEXT: .LBB12_24: ; %Flow120 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 -; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 -; CI-NEXT: v_rcp_f32_e32 v6, v4 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: s_cbranch_vccz .LBB12_26 +; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v4, s11 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_26: ; %Flow115 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_32 +; CI-NEXT: ; %bb.27: ; %frem.compute77 +; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11| +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11| +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8 +; CI-NEXT: v_ldexp_f32_e64 v7, v3, 12 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; CI-NEXT: v_rcp_f32_e32 v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v5, v6 -; CI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; CI-NEXT: v_fma_f32 v11, v12, v11, v11 +; CI-NEXT: v_mul_f32_e32 v12, v10, v11 +; CI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; CI-NEXT: v_fma_f32 v12, v13, v11, v12 +; CI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 +; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_30 +; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: v_mul_f32_e32 v7, v8, v6 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v9, v7, v4 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_add_i32_e32 v5, vcc, -12, v5 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5 +; CI-NEXT: s_cbranch_vccnz .LBB12_29 +; CI-NEXT: s_branch .LBB12_31 +; CI-NEXT: .LBB12_30: +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 +; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 +; CI-NEXT: v_mul_f32_e32 v6, v5, v6 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_or_b32_e32 v3, s2, v3 +; CI-NEXT: .LBB12_32: ; %Flow116 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v3, -v4, v3, s7 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -969,71 +2826,325 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 -; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 -; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_2: ; %Flow127 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v0, |s4| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v0, -v1, v0, s4 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB12_5 +; VI-NEXT: s_branch .LBB12_7 +; VI-NEXT: .LBB12_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: .LBB12_8: ; %Flow128 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 -; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 -; VI-NEXT: v_rcp_f32_e32 v4, v2 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB12_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: s_and_b32 s2, s5, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_10: ; %Flow123 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; VI-NEXT: v_ldexp_f32 v2, v2, 1 +; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9| +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 12 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; VI-NEXT: v_fma_f32 v4, v5, v4, v4 -; VI-NEXT: v_mul_f32_e32 v5, v3, v4 -; VI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; VI-NEXT: v_fma_f32 v5, v6, v4, v5 -; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 -; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_fma_f32 v1, -v2, v1, s5 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; VI-NEXT: s_cbranch_vccnz .LBB12_13 +; VI-NEXT: s_branch .LBB12_15 +; VI-NEXT: .LBB12_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: s_and_b32 s2, s5, 0x80000000 +; VI-NEXT: v_or_b32_e32 v1, s2, v1 +; VI-NEXT: .LBB12_16: ; %Flow124 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 -; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 -; VI-NEXT: v_rcp_f32_e32 v5, v3 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_cbranch_vccz .LBB12_18 +; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: s_and_b32 s2, s6, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_18: ; %Flow119 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_24 +; VI-NEXT: ; %bb.19: ; %frem.compute46 +; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10| +; VI-NEXT: v_ldexp_f32 v3, v3, 1 +; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s6| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10| +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7 +; VI-NEXT: v_ldexp_f32 v6, v2, 12 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; VI-NEXT: v_fma_f32 v5, v6, v5, v5 -; VI-NEXT: v_mul_f32_e32 v6, v4, v5 -; VI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; VI-NEXT: v_fma_f32 v6, v7, v5, v6 -; VI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v9, v10 +; VI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 -; VI-NEXT: v_trunc_f32_e32 v3, v3 -; VI-NEXT: v_fma_f32 v2, -v3, v2, s6 +; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 +; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_22 +; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v5 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v8, v6, v3 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -12, v4 +; VI-NEXT: v_ldexp_f32 v6, v6, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; VI-NEXT: s_cbranch_vccnz .LBB12_21 +; VI-NEXT: s_branch .LBB12_23 +; VI-NEXT: .LBB12_22: +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_ldexp_f32 v4, v7, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v5 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; VI-NEXT: s_and_b32 s2, s6, 0x80000000 +; VI-NEXT: v_or_b32_e32 v2, s2, v2 +; VI-NEXT: .LBB12_24: ; %Flow120 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 -; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 -; VI-NEXT: v_rcp_f32_e32 v6, v4 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_cbranch_vccz .LBB12_26 +; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: s_and_b32 s2, s7, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_26: ; %Flow115 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_32 +; VI-NEXT: ; %bb.27: ; %frem.compute77 +; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11| +; VI-NEXT: v_ldexp_f32 v4, v4, 1 +; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11| +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8 +; VI-NEXT: v_ldexp_f32 v7, v3, 12 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; VI-NEXT: v_rcp_f32_e32 v11, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; VI-NEXT: v_fma_f32 v6, v7, v6, v6 -; VI-NEXT: v_mul_f32_e32 v7, v5, v6 -; VI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; VI-NEXT: v_fma_f32 v7, v8, v6, v7 -; VI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; VI-NEXT: v_fma_f32 v11, v12, v11, v11 +; VI-NEXT: v_mul_f32_e32 v12, v10, v11 +; VI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; VI-NEXT: v_fma_f32 v12, v13, v11, v12 +; VI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 -; VI-NEXT: v_trunc_f32_e32 v4, v4 -; VI-NEXT: v_fma_f32 v3, -v4, v3, s7 +; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 +; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_30 +; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v6 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v9, v7, v4 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, -12, v5 +; VI-NEXT: v_ldexp_f32 v7, v7, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5 +; VI-NEXT: s_cbranch_vccnz .LBB12_29 +; VI-NEXT: s_branch .LBB12_31 +; VI-NEXT: .LBB12_30: +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 +; VI-NEXT: v_ldexp_f32 v5, v8, v5 +; VI-NEXT: v_mul_f32_e32 v6, v5, v6 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v4, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_ldexp_f32 v3, v4, v3 +; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; VI-NEXT: s_and_b32 s2, s7, 0x80000000 +; VI-NEXT: v_or_b32_e32 v3, s2, v3 +; VI-NEXT: .LBB12_32: ; %Flow116 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1054,39 +3165,198 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] -; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB13_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB13_2: ; %Flow53 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB13_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| +; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8 +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 +; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 +; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; CI-NEXT: s_cbranch_vccnz .LBB13_5 +; CI-NEXT: s_branch .LBB13_7 +; CI-NEXT: .LBB13_6: +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 +; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: v_or_b32_e32 v1, s3, v1 +; CI-NEXT: .LBB13_8: ; %Flow54 +; CI-NEXT: v_mov_b32_e32 v2, s10 +; CI-NEXT: v_mov_b32_e32 v3, s11 +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CI-NEXT: s_cbranch_vccz .LBB13_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] -; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB13_10: ; %Flow49 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB13_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| +; CI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]| +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v8 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v11, vcc, v4, v10 +; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1 +; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0 +; CI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5] +; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; CI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 +; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8 +; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9 +; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xffffffe6, v11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11 +; CI-NEXT: s_cbranch_vccnz .LBB13_13 +; CI-NEXT: s_branch .LBB13_15 +; CI-NEXT: .LBB13_14: +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11 +; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_or_b32_e32 v2, s2, v2 +; CI-NEXT: v_or_b32_e32 v3, s3, v3 +; CI-NEXT: .LBB13_16: ; %Flow50 +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; CI-NEXT: v_mov_b32_e32 v5, 0x7ff00000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[4:5]|, v[4:5] +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[6:7]|, v[4:5] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -1097,39 +3367,198 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] -; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB13_2: ; %Flow53 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB13_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| +; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8 +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 +; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 +; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; VI-NEXT: s_cbranch_vccnz .LBB13_5 +; VI-NEXT: s_branch .LBB13_7 +; VI-NEXT: .LBB13_6: +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 +; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v1, s3, v1 +; VI-NEXT: .LBB13_8: ; %Flow54 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 +; VI-NEXT: s_cbranch_vccz .LBB13_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] -; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] -; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] -; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB13_10: ; %Flow49 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB13_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| +; VI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]| +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v8 +; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v11, vcc, v4, v10 +; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1 +; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0 +; VI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5] +; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; VI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 +; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8 +; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9 +; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0xffffffe6, v11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11 +; VI-NEXT: s_cbranch_vccnz .LBB13_13 +; VI-NEXT: s_branch .LBB13_15 +; VI-NEXT: .LBB13_14: +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11 +; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; VI-NEXT: v_or_b32_e32 v2, s2, v2 +; VI-NEXT: v_or_b32_e32 v3, s3, v3 +; VI-NEXT: .LBB13_16: ; %Flow50 +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; VI-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[4:5]|, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[6:7]|, v[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e9..bd5303213a69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -396,8 +396,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 @@ -784,19 +783,17 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX6-LABEL: v_fshl_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 ; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 @@ -974,100 +971,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s1, 8 -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s9, s2, 8 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s11, s2, 7 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_and_b32 s3, s7, 0xff -; GFX8-NEXT: s_lshl_b32 s2, s4, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s11, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s8, s2, 8 +; GFX8-NEXT: s_lshr_b32 s9, s2, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 24 +; GFX8-NEXT: s_lshr_b32 s11, s11, 1 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshr_b32 s2, s11, s2 +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s8, 7 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_andn2_b32 s4, 7, s10 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_andn2_b32 s6, 7, s8 +; GFX8-NEXT: s_lshr_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s4, s8, 1 -; GFX8-NEXT: s_andn2_b32 s5, 7, s11 -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s4, s5 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_and_b32 s3, s9, 7 +; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s9 +; GFX8-NEXT: s_lshr_b32 s4, s4, s6 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 0xff +; GFX8-NEXT: s_and_b32 s4, s10, 7 +; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 25 +; GFX8-NEXT: s_andn2_b32 s5, 7, s10 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshr_b32 s1, s1, s5 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s1, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s2, 8 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s11, s2, 7 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s12 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_and_b32 s2, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_andn2_b32 s3, 7, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_and_b32 s3, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s2, s4, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s11 +; GFX9-NEXT: s_and_b32 s11, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s8, s2, 8 +; GFX9-NEXT: s_lshr_b32 s9, s2, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 24 +; GFX9-NEXT: s_lshr_b32 s11, s11, 1 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshr_b32 s2, s11, s2 +; GFX9-NEXT: s_lshr_b32 s6, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s8, 7 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_andn2_b32 s4, 7, s10 -; GFX9-NEXT: s_lshr_b32 s3, s3, s4 +; GFX9-NEXT: s_andn2_b32 s6, 7, s8 +; GFX9-NEXT: s_lshr_b32 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s4, s8, 1 -; GFX9-NEXT: s_andn2_b32 s5, 7, s11 -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xff +; GFX9-NEXT: s_and_b32 s3, s9, 7 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s4, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s9 +; GFX9-NEXT: s_lshr_b32 s4, s4, s6 ; GFX9-NEXT: s_or_b32 s3, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 0xff +; GFX9-NEXT: s_and_b32 s4, s10, 7 +; GFX9-NEXT: s_lshl_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 25 +; GFX9-NEXT: s_andn2_b32 s5, 7, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshr_b32 s1, s1, s5 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -1075,100 +1070,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s11, s11, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s2, s11, s2 +; GFX10-NEXT: s_and_b32 s11, s8, 7 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_andn2_b32 s8, 7, s8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_lshr_b32 s6, s8, 1 -; GFX10-NEXT: s_andn2_b32 s7, 7, s11 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s6, s7 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s11 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s6 +; GFX10-NEXT: s_and_b32 s3, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s9, 7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: s_andn2_b32 s7, 7, s9 +; GFX10-NEXT: s_lshl_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_lshr_b32 s1, s1, 25 +; GFX10-NEXT: s_andn2_b32 s7, 7, s10 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s7 +; GFX10-NEXT: s_or_b32 s3, s4, s3 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s5, s1 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s1, 16 -; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 +; GFX11-NEXT: s_and_b32 s11, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-NEXT: s_lshr_b32 s10, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s11, s11, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s2, s11, s2 +; GFX11-NEXT: s_and_b32 s11, s8, 7 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_not1_b32 s8, 7, s8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_lshr_b32 s6, s8, 1 -; GFX11-NEXT: s_and_not1_b32 s7, 7, s11 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s6, s7 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, s11 +; GFX11-NEXT: s_lshr_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: s_and_b32 s3, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s9, 7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s9 +; GFX11-NEXT: s_lshl_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_lshr_b32 s1, s1, 25 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s10 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 24 +; GFX11-NEXT: s_or_b32 s1, s5, s1 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -1184,38 +1177,34 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-LABEL: v_fshl_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_bfe_u32 v9, v1, 1, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 -; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v6, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v7 -; GFX6-NEXT: v_not_b32_e32 v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v7, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v8, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 @@ -1255,18 +1244,18 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-NEXT: v_mov_b32_e32 v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 -; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_mov_b32_e32 v8, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v9, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1305,21 +1294,21 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_mov_b32_e32 v10, -1 +; GFX9-NEXT: v_mov_b32_e32 v9, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 +; GFX9-NEXT: v_lshrrev_b16_e32 v8, v10, v8 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1334,111 +1323,109 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX10-NEXT: v_lshrrev_b16 v7, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v9 +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v11 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xff ; GFX10-NEXT: v_mov_b32_e32 v11, -1 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_mov_b32_e32 v13, 7 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX10-NEXT: v_mov_b32_e32 v12, 7 +; GFX10-NEXT: v_lshrrev_b16 v9, 1, v9 +; GFX10-NEXT: v_and_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v13, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b16 v9, v13, v10 +; GFX10-NEXT: v_lshlrev_b16 v2, v2, v6 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshrrev_b16 v6, v8, v7 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX11-NEXT: v_lshlrev_b16 v3, v8, v3 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v1 ; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 +; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_lshrrev_b16 v12, 1, v12 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 -; GFX11-NEXT: v_lshlrev_b16 v5, v11, v5 -; GFX11-NEXT: v_lshrrev_b16 v7, v13, v8 -; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v8, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, v10, v5 +; GFX11-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v11, v0 +; GFX11-NEXT: v_lshrrev_b16 v2, v2, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -3686,22 +3673,21 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s2, 15 -; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s5, s2, 15 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 15 -; GFX8-NEXT: s_andn2_b32 s2, 15, s5 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s3, s4, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s4, s2, 16 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshr_b32 s5, s5, 1 +; GFX8-NEXT: s_lshr_b32 s2, s5, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 17 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -3813,13 +3799,12 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 15 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3886,14 +3871,12 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3964,11 +3947,10 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, -1 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s1, 17 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -4058,11 +4040,10 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 17, v0 ; GFX8-NEXT: s_lshl_b32 s0, s2, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -4142,21 +4123,20 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: v_fshl_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_and_b32 s3, s1, 15 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s3, v0 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_and_b32 s0, s3, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 17 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4256,23 +4236,22 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshl_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NEXT: s_and_b32 s9, s4, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s8, s4, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s7, 1 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s8, s8, 1 +; GFX8-NEXT: s_lshr_b32 s4, s8, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s7, 15 +; GFX8-NEXT: s_andn2_b32 s7, 15, s7 +; GFX8-NEXT: s_lshr_b32 s2, s2, 17 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s7 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 @@ -4469,13 +4448,12 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, 15 -; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 @@ -4593,39 +4571,37 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX8-LABEL: s_fshl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_and_b32 s12, s4, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s10, s4, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s10, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s10 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s8, 1 -; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s10 +; GFX8-NEXT: s_and_b32 s10, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s10, s10, 1 +; GFX8-NEXT: s_lshr_b32 s4, s10, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s8, 15 +; GFX8-NEXT: s_andn2_b32 s8, 15, s8 +; GFX8-NEXT: s_lshr_b32 s2, s2, 17 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s5, 15 -; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s9, s5, 16 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, s9, 15 +; GFX8-NEXT: s_andn2_b32 s5, 15, s9 +; GFX8-NEXT: s_lshr_b32 s3, s3, 17 +; GFX8-NEXT: s_lshl_b32 s4, s7, s4 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s3, s11, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s11 -; GFX8-NEXT: s_lshr_b32 s5, s9, 1 -; GFX8-NEXT: s_lshl_b32 s3, s7, s3 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4810,26 +4786,25 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, 15 -; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 17, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5023,10 +4998,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshl_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5036,10 +5010,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshl_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5049,10 +5022,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshl_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_not_b32_e32 v4, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5062,12 +5034,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-LABEL: v_fshl_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5075,16 +5046,14 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX11-LABEL: v_fshl_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX11-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5204,10 +5173,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6-LABEL: v_fshl_i64_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5216,10 +5184,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5228,10 +5195,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5239,11 +5205,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX10-LABEL: v_fshl_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX10-NEXT: v_bfi_b32 v2, v0, 0, 63 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5251,16 +5216,14 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX11-LABEL: v_fshl_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX11-NEXT: v_bfi_b32 v2, v0, 0, 63 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5466,18 +5429,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6-LABEL: v_fshl_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 -; GFX6-NEXT: v_not_b32_e32 v8, v8 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX6-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_not_b32_e32 v4, v10 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5487,18 +5448,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8-LABEL: v_fshl_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX8-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_not_b32_e32 v4, v10 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5508,18 +5467,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9-LABEL: v_fshl_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX9-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_not_b32_e32 v4, v10 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5529,18 +5486,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-LABEL: v_fshl_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX10-NEXT: v_bfi_b32 v8, v8, 0, 63 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v10 +; GFX10-NEXT: v_bfi_b32 v10, v10, 0, 63 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5550,20 +5505,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX11-LABEL: v_fshl_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 -; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX11-NEXT: v_bfi_b32 v8, v8, 0, 63 +; GFX11-NEXT: v_and_b32_e32 v11, 63, v10 +; GFX11-NEXT: v_bfi_b32 v10, v10, 0, 63 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 @@ -5818,32 +5771,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, 0xffffffc0, v16 ; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 -; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 -; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v16 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v18 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 +; GFX6-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 -; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 -; GFX6-NEXT: v_not_b32_e32 v16, 63 +; GFX6-NEXT: v_bfi_b32 v14, v8, 0, v15 +; GFX6-NEXT: v_not_b32_e32 v17, 63 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v17 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 @@ -5867,32 +5820,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffffc0, v16 ; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v18, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 -; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 -; GFX8-NEXT: v_not_b32_e32 v16, 63 +; GFX8-NEXT: v_bfi_b32 v14, v8, 0, v15 +; GFX8-NEXT: v_not_b32_e32 v17, 63 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v17 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -5916,27 +5869,27 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 -; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 ; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v4, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX9-NEXT: v_bfi_b32 v14, v8, 0, v15 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 ; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 @@ -5963,99 +5916,96 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-LABEL: v_fshl_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v12, v8 +; GFX10-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX10-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v17 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3] +; GFX10-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX10-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 -; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18 -; GFX10-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX10-NEXT: v_or_b32_e32 v10, v15, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17 +; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18 +; GFX10-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10] +; GFX10-NEXT: v_or_b32_e32 v11, v11, v7 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v18 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10] +; GFX10-NEXT: v_or_b32_e32 v0, v6, v15 +; GFX10-NEXT: v_or_b32_e32 v6, v7, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10] +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v1, s5 +; GFX10-NEXT: v_or_b32_e32 v0, v13, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v9, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v12, v8 +; GFX11-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v12 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19 -; GFX11-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX11-NEXT: v_or_b32_e32 v10, v15, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX11-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v17 +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3] +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17 +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18 +; GFX11-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10] +; GFX11-NEXT: v_or_b32_e32 v11, v11, v7 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v18 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10] +; GFX11-NEXT: v_or_b32_e32 v0, v6, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10] +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v13, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6064,264 +6014,260 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v8 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 -; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0xffffffc0, v7 -; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v8 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xffffffc0, v8 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v8 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_not_b32_e32 v0, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v10 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 -; GFX6-NEXT: v_not_b32_e32 v8, 63 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v9, 63 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v9 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v9 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v8 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffffc0, v8 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v10, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11 -; GFX8-NEXT: v_not_b32_e32 v8, 63 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v9, 63 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v9 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v9, s[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v8 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_not_b32_e32 v0, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v7 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v6, v0 +; GFX10-NEXT: v_and_b32_e32 v11, 0x7f, v0 +; GFX10-NEXT: v_bfi_b32 v12, v0, 0, 0x7f ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11 +; GFX10-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3] +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12 ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX10-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9] +; GFX10-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v6, v0 -; GFX11-NEXT: s_lshl_b32 s9, s6, 31 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: v_and_b32_e32 v11, 0x7f, v0 +; GFX11-NEXT: v_bfi_b32 v12, v0, 0, 0x7f ; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s9, s6, 31 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v13 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7] -; GFX11-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_add_nc_u32 v13, 0xffffffc0, v12 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v11 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0 -; GFX11-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v1, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -7445,185 +7391,183 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_not_b32_e32 v18, 63 +; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18 +; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23 +; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 +; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 +; GFX6-NEXT: v_or_b32_e32 v16, v10, v21 +; GFX6-NEXT: v_or_b32_e32 v21, v11, v22 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1 +; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14 +; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 -; GFX6-NEXT: v_not_b32_e32 v25, 63 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 -; GFX6-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX6-NEXT: v_not_b32_e32 v8, v20 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18 +; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX6-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_not_b32_e32 v18, 63 +; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18 +; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18 +; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX8-NEXT: v_or_b32_e32 v16, v10, v21 +; GFX8-NEXT: v_or_b32_e32 v21, v11, v22 +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] +; GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v25, 63 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX8-NEXT: v_not_b32_e32 v8, v20 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: @@ -7632,17 +7576,17 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_not_b32_e32 v16, v16 +; GFX9-NEXT: v_mov_b32_e32 v24, 0x7f ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] ; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 +; GFX9-NEXT: v_bfi_b32 v25, v16, 0, v24 +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v25 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 ; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 @@ -7650,48 +7594,47 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v26, v2 ; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, 64, v17 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v17 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8 +; GFX9-NEXT: v_bfi_b32 v13, v20, 0, v24 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] @@ -7709,68 +7652,66 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v21, v16 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3] +; GFX10-NEXT: v_bfi_b32 v29, v16, 0, 0x7f +; GFX10-NEXT: v_sub_nc_u32_e32 v21, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v25, 0xffffffc0, v19 +; GFX10-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX10-NEXT: v_or_b32_e32 v18, v18, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v19, v22 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v21, v23, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v29 +; GFX10-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 +; GFX10-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29 +; GFX10-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v0, v22, v24 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v29 +; GFX10-NEXT: v_or_b32_e32 v22, v26, v28 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v21, s5 -; GFX10-NEXT: v_or_b32_e32 v22, v24, v26 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v21, v29, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v19, v22, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v19, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v22, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v23, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v16, v2 -; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v23 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 +; GFX10-NEXT: v_bfi_b32 v20, v20, 0, 0x7f +; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s5 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v16 -; GFX10-NEXT: v_or_b32_e32 v1, v17, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v0, v30, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX10-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20 @@ -7807,96 +7748,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-LABEL: v_fshl_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v21, v16 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1] -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v27 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27 +; GFX11-NEXT: v_bfi_b32 v29, v16, 0, 0x7f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v18, v18, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX11-NEXT: v_or_b32_e32 v0, v19, v22 -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v21, v23, v25 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v29 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v21, 64, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v18, 0, v18 :: v_dual_add_nc_u32 v25, 0xffffffc0, v19 +; GFX11-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29 +; GFX11-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v22, v24 +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] +; GFX11-NEXT: v_or_b32_e32 v22, v26, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v22, v24, v26 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v21, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_cndmask_b32_e64 v21, v29, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v19, v22, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v19, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v22, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v22, v23, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 -; GFX11-NEXT: v_or_b32_e32 v0, v16, v2 -; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo +; GFX11-NEXT: v_bfi_b32 v20, v20, 0, 0x7f ; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v23 -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v16 -; GFX11-NEXT: v_or_b32_e32 v1, v17, v3 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v30, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23 ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20 +; GFX11-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[8:9] +; GFX11-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] ; GFX11-NEXT: v_or_b32_e32 v2, v21, v24 ; GFX11-NEXT: v_or_b32_e32 v11, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v10, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v12, 0, v12 :: v_dual_cndmask_b32 v21, v4, v10 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v26, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 ; GFX11-NEXT: v_or_b32_e32 v10, v16, v18 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 ; GFX11-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v20, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v8, s2 -; GFX11-NEXT: v_or_b32_e32 v3, v22, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v4, v9, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v10, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v11, s1 +; GFX11-NEXT: v_or_b32_e32 v3, v22, v25 ; GFX11-NEXT: v_or_b32_e32 v4, v12, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v5, v13, v8 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7..ea6b3a3ad786 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -398,8 +398,7 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 @@ -785,19 +784,17 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX6-LABEL: v_fshr_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 @@ -1187,40 +1184,36 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-LABEL: v_fshr_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v10, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 -; GFX6-NEXT: v_not_b32_e32 v7, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_bfi_b32 v7, v7, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 -; GFX6-NEXT: v_not_b32_e32 v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_bfi_b32 v7, v8, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_not_b32_e32 v4, v9 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v9, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 @@ -3411,32 +3404,19 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s4, s4, 0xffff -; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s5, s5, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s5, s5, 14 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s5 -; GFX6-NEXT: s_lshr_b32 s5, s4, 16 ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s2, s2, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_lshr_b32 s2, s3, s2 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 @@ -3446,33 +3426,22 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s5, s5, 15 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s5, s4, 15 -; GFX8-NEXT: s_xor_b32 s2, s2, -1 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_or_b32 s3, s3, s5 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16 ; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s6 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 15 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX8-NEXT: s_andn2_b32 s2, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -3554,65 +3523,43 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, 15 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v4, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 15 ; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v5, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -3664,13 +3611,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_fshr_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_bfe_u32 v2, v2, 4, 12 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3678,14 +3623,12 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3723,35 +3666,22 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshr_v2i16_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s4, s4, 14 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 -; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s4, s4, 14 -; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s2, 0xffff +; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 -; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_lshl_b32 s0, s1, 1 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_and_b32 s0, s3, 0xffff +; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3760,36 +3690,24 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; ; GFX8-LABEL: v_fshr_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, 15 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_lshr_b32 s4, s3, 15 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, -1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 ; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: s_lshl_b32 s0, s2, 1 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3845,33 +3763,20 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_and_b32 s4, s2, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2 -; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 -; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_xor_b32 s0, s2, -1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: s_and_b32 s2, s0, 15 -; GFX6-NEXT: s_andn2_b32 s0, 15, s0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, 15 -; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s3, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: s_lshl_b32 s1, s1, s2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3881,31 +3786,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; GFX8-LABEL: v_fshr_v2i16_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, s4, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_lshl_b32 s0, s2, 1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: s_xor_b32 s0, s1, -1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, 15 -; GFX8-NEXT: s_andn2_b32 s0, 15, s0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, s0, v3 -; GFX8-NEXT: s_and_b32 s0, s1, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog @@ -3970,32 +3865,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 -; GFX6-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_lshr_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, s4 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s0, s0, s2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; GFX6-NEXT: s_lshr_b32 s0, s1, s0 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -4005,32 +3887,21 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: v_fshr_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s3, s2, 15 -; GFX8-NEXT: s_xor_b32 s1, s1, -1 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s1, v1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s2 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0 +; GFX8-NEXT: s_lshr_b32 s0, s2, s0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4091,46 +3962,26 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_and_b32 s7, s8, 0xffff -; GFX6-NEXT: s_bfe_u32 s8, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s8, s8, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s8 -; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s8, s8, 14 -; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_xor_b32 s6, s6, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s8 -; GFX6-NEXT: s_lshr_b32 s8, s6, 16 ; GFX6-NEXT: s_and_b32 s9, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s9 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 -; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 +; GFX6-NEXT: s_lshr_b32 s3, s3, s9 ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s8, 15 -; GFX6-NEXT: s_andn2_b32 s6, 15, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 -; GFX6-NEXT: s_or_b32 s1, s1, s3 -; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 +; GFX6-NEXT: s_and_b32 s3, s7, 15 +; GFX6-NEXT: s_andn2_b32 s6, 15, s7 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s6 +; GFX6-NEXT: s_lshr_b32 s3, s4, s3 +; GFX6-NEXT: s_andn2_b32 s4, 15, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s5, 1 -; GFX6-NEXT: s_xor_b32 s4, s7, -1 -; GFX6-NEXT: s_and_b32 s5, s4, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s2, s2, s5 -; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_or_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s3, s8, 15 +; GFX6-NEXT: s_lshl_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_lshr_b32 s3, s4, s3 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 @@ -4141,43 +3992,26 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s8, s8, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s0, s0, s8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s8, s7, 15 -; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s6, s6, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 16 ; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_lshr_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s7, s2 +; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, 15 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_xor_b32 s4, s5, -1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_and_b32 s5, s4, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 @@ -4332,92 +4166,58 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-LABEL: v_fshr_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v9, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 15, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_mov_b32_e32 v7, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, 15 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v9 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_and_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 ; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4491,64 +4291,38 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s9, s9, 16 -; GFX6-NEXT: s_and_b32 s8, s8, 0xffff -; GFX6-NEXT: s_or_b32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s9, s11, 16 -; GFX6-NEXT: s_and_b32 s10, s10, 0xffff -; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_bfe_u32 s10, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s10, s10, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s10 -; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s10, s10, 14 -; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s10, s8, 16 -; GFX6-NEXT: s_and_b32 s11, s8, 15 +; GFX6-NEXT: s_and_b32 s12, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s11 -; GFX6-NEXT: s_lshr_b32 s4, s4, s8 -; GFX6-NEXT: s_lshl_b32 s5, s5, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s8 +; GFX6-NEXT: s_lshr_b32 s4, s4, s12 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s10, 15 -; GFX6-NEXT: s_andn2_b32 s8, 15, s10 -; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s4, s4, s8 +; GFX6-NEXT: s_and_b32 s4, s9, 15 +; GFX6-NEXT: s_andn2_b32 s8, 15, s9 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s8 +; GFX6-NEXT: s_lshr_b32 s4, s5, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_andn2_b32 s4, 15, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 1 -; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, 14 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 1 -; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s6, 1 -; GFX6-NEXT: s_xor_b32 s5, s9, -1 -; GFX6-NEXT: s_lshl_b32 s4, s7, 1 -; GFX6-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NEXT: s_and_b32 s7, s5, 15 -; GFX6-NEXT: s_andn2_b32 s5, 15, s5 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, s7 -; GFX6-NEXT: s_lshr_b32 s3, s3, s5 -; GFX6-NEXT: s_or_b32 s1, s1, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 15 -; GFX6-NEXT: s_andn2_b32 s5, 15, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s5 -; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s10, 15 +; GFX6-NEXT: s_lshl_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s1, s4, s1 +; GFX6-NEXT: s_andn2_b32 s4, 15, s11 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s2, s11, 15 +; GFX6-NEXT: s_lshl_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 @@ -4557,63 +4331,41 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX8-LABEL: s_fshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s8, s8, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s0, s0, s8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s8, s7, 15 -; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s6, s6, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 16 ; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_lshr_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s6, s6, 1 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s7, s2 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s6, s6, 15 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: s_lshr_b32 s6, s4, 15 -; GFX8-NEXT: s_xor_b32 s5, s5, -1 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 ; GFX8-NEXT: s_and_b32 s7, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, s7 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s3, s3, s7 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s6, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX8-NEXT: s_andn2_b32 s5, 15, s6 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s5 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 @@ -4749,120 +4501,76 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-LABEL: v_fshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX6-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 +; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v12, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_mov_b32_e32 v7, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, 15 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_xor_b32_e32 v11, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, v11, v9 -; GFX8-NEXT: v_mov_b32_e32 v10, -1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX8-NEXT: v_and_b32_sdwa v9, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v10, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v9, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, 15, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v6, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_sdwa v4, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -5052,8 +4760,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_not_b32_e32 v5, v4 -; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX6-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5065,8 +4772,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_not_b32_e32 v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX8-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5078,8 +4784,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX9-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5090,12 +4795,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-LABEL: v_fshr_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5103,16 +4807,14 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX11-LABEL: v_fshr_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5228,9 +4930,8 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 @@ -5240,9 +4941,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] @@ -5252,9 +4952,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] @@ -5264,29 +4963,27 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX10-LABEL: v_fshr_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX10-NEXT: v_bfi_b32 v1, v0, 0, 63 +; GFX10-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: v_bfi_b32 v1, v0, 0, 63 +; GFX11-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> @@ -5492,15 +5189,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_not_b32_e32 v9, v8 -; GFX6-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX6-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v10 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 @@ -5513,15 +5208,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_not_b32_e32 v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX8-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v10 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] @@ -5534,15 +5227,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_not_b32_e32 v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX9-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v10 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] @@ -5554,16 +5245,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-LABEL: v_fshr_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_bfi_b32 v11, v10, 0, 63 ; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 @@ -5575,17 +5264,15 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX11-LABEL: v_fshr_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 -; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: v_bfi_b32 v11, v10, 0, 63 ; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] @@ -5848,8 +5535,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v8 -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX6-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 ; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 @@ -5897,8 +5584,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v8 -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX8-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 ; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] @@ -5946,8 +5633,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v8 -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX9-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] @@ -5990,107 +5677,103 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-LABEL: v_fshr_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 +; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v21, 0x7f, v8 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v9 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, 64, v20 -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21 -; GFX10-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v10, v12, v10 -; GFX10-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v21 -; GFX10-NEXT: v_or_b32_e32 v12, v15, v17 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v14, v16 +; GFX10-NEXT: v_or_b32_e32 v10, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v18, v0, s5 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v19, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v8, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 +; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v9 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v12, 64, v20 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v10, v12, v10 -; GFX11-NEXT: v_and_b32_e32 v21, 0x7f, v8 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20 -; GFX11-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21 -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v20 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v21 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX11-NEXT: v_or_b32_e32 v12, v15, v17 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v10, v15, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v18, v0, s1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v19, v12, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v13, v4, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 -; GFX11-NEXT: v_or_b32_e32 v0, v8, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6099,12 +5782,12 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_not_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 ; GFX6-NEXT: v_not_b32_e32 v8, 63 @@ -6152,12 +5835,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 ; GFX8-NEXT: v_not_b32_e32 v8, 63 @@ -6205,12 +5888,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] @@ -6257,101 +5940,99 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX10-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11 +; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX10-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v7, v9 +; GFX10-NEXT: v_or_b32_e32 v7, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: s_mov_b32 s9, 0 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f ; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 ; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX11-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9] +; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11 +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12 +; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5] +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX11-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v7, v9 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -7486,226 +7167,224 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x7f +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX6-NEXT: v_bfi_b32 v19, v16, 0, v18 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v16 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25 -; GFX6-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 -; GFX6-NEXT: v_not_b32_e32 v26, 63 -; GFX6-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26 -; GFX6-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_not_b32_e32 v17, 63 +; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v17 +; GFX6-NEXT: v_lshr_b64 v[23:24], v[21:22], v23 +; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[21:22], v19 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[21:22], v27 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v2, v17 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], v21 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX6-NEXT: v_or_b32_e32 v21, v2, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v3, v22 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v16 +; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v24, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v8 +; GFX6-NEXT: v_or_b32_e32 v3, v23, v9 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v20 -; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26 -; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 +; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[6:7], v16 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX6-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26 -; GFX6-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v6 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v6 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8 +; GFX6-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX6-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v16, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX6-NEXT: v_or_b32_e32 v6, v10, v8 +; GFX6-NEXT: v_or_b32_e32 v7, v11, v9 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7f +; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX8-NEXT: v_bfi_b32 v19, v16, 0, v18 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v16 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX8-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25 -; GFX8-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v26, 63 -; GFX8-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26 -; GFX8-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_not_b32_e32 v17, 63 +; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v17 +; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[21:22] +; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[21:22] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v27, v[21:22] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v2, v17 +; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[10:11] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX8-NEXT: v_or_b32_e32 v21, v2, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v3, v22 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[10:11] +; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v19, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v23, v9 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v20 -; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26 -; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v6, v17 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15] +; GFX8-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX8-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v17, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v16, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v10, v8 +; GFX8-NEXT: v_or_b32_e32 v7, v11, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v19, 0x7f ; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX9-NEXT: v_bfi_b32 v23, v16, 0, v19 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v16 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX9-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25 -; GFX9-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v26, 0x7f, v16 +; GFX9-NEXT: v_or_b32_e32 v24, v0, v21 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v26 +; GFX9-NEXT: v_or_b32_e32 v25, v1, v22 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v26, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v23 ; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v25, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v26 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v23, v[17:18] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v1, v22, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] @@ -7713,9 +7392,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v19 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v20 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 @@ -7760,14 +7438,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-LABEL: v_fshr_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v17, v16 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 +; GFX10-NEXT: v_bfi_b32 v25, v16, 0, 0x7f ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 +; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 @@ -7776,54 +7452,54 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX10-NEXT: v_bfi_b32 v25, v20, 0, 0x7f ; GFX10-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v3, s4 +; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4 ; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v26 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX10-NEXT: v_not_b32_e32 v16, v20 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX10-NEXT: v_or_b32_e32 v8, v2, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] -; GFX10-NEXT: v_or_b32_e32 v8, v2, v8 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] ; GFX10-NEXT: v_or_b32_e32 v2, v21, v26 ; GFX10-NEXT: v_or_b32_e32 v9, v3, v9 -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20 ; GFX10-NEXT: v_or_b32_e32 v8, v16, v18 ; GFX10-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo @@ -7851,99 +7527,95 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-LABEL: v_fshr_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v17, v16 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17 +; GFX11-NEXT: v_bfi_b32 v25, v16, 0, 0x7f ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] ; GFX11-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v24, 0, v24 :: v_dual_add_nc_u32 v19, 0xffffffc0, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 +; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_cndmask_b32 v24, 0, v24 +; GFX11-NEXT: v_bfi_b32 v25, v20, 0, 0x7f ; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_cndmask_b32 v21, v0, v21 :: v_dual_cndmask_b32 v22, v1, v22 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e64 v22, v22, v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v5 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX11-NEXT: v_not_b32_e32 v16, v20 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v16 :: v_dual_cndmask_b32 v1, v1, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_and_b32 v20, 0x7f, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v26, 0, v2 :: v_dual_and_b32 v25, 0x7f, v16 ; GFX11-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0 +; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] ; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20 -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 ; GFX11-NEXT: v_or_b32_e32 v8, v2, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v2, v21, v26 ; GFX11-NEXT: v_or_b32_e32 v9, v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] ; GFX11-NEXT: v_or_b32_e32 v8, v16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v12, s2 ; GFX11-NEXT: v_or_b32_e32 v3, v22, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s1 ; GFX11-NEXT: v_or_b32_e32 v4, v10, v5 -; GFX11-NEXT: v_or_b32_e32 v5, v11, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v5, v11, v12 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll index 3390ad2cf2a0..ab71f1f44b2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll @@ -1,4 +1,4 @@ -; RUN: not llc -global-isel -global-isel-abort=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -global-isel -global-isel-abort=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s ; FIXME: Should produce context error for each one ; ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(p5) = G_GLOBAL_VALUE @external_private (in function: fn_external_private) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index cae833b0d64e..0e1bbbd1ea92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -123,9 +123,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 @@ -143,11 +142,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -302,9 +300,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -319,9 +316,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_and_b32 s1, s4, 0xffff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -393,9 +389,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -410,9 +405,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -482,12 +476,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s0, s2, 0xffff -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -505,11 +498,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -576,10 +568,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -597,11 +588,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -668,10 +658,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -689,11 +678,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -820,19 +808,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 +; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] @@ -846,19 +833,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 1 ; GFX7-NEXT: s_lshr_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s1, s3, 1 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1090,8 +1076,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1117,8 +1102,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1228,8 +1212,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1246,17 +1229,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1356,16 +1338,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX8-NEXT: s_and_b32 s0, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1382,16 +1363,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1479,15 +1459,14 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1501,19 +1480,18 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1601,16 +1579,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1627,16 +1604,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1910,14 +1886,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s4, s3, 1 +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s6, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -1926,7 +1901,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] @@ -1942,14 +1917,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s4, s3, 1 +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s6, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -1958,7 +1932,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -2263,17 +2237,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 @@ -2294,23 +2267,22 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 @@ -2441,23 +2413,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2478,23 +2449,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,7 +2598,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -2636,7 +2605,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc @@ -2658,9 +2627,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -2668,7 +2636,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc @@ -2773,13 +2741,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -2789,7 +2756,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] @@ -2805,14 +2772,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s4, s2, 1 +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2821,7 +2787,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] @@ -2935,7 +2901,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -2943,7 +2908,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -2959,15 +2924,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -2975,7 +2939,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -3283,19 +3247,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: s_lshr_b32 m0, s3, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 m0, s3, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v12, v0 -; GFX8-NEXT: v_and_b32_e32 v12, s0, v12 +; GFX8-NEXT: v_bfi_b32 v12, s0, 0, v12 ; GFX8-NEXT: v_or_b32_e32 v12, s1, v12 ; GFX8-NEXT: v_movreld_b32_e32 v0, v12 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3310,17 +3273,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: s_lshr_b32 m0, s3, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s3, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_movreld_b32_e32 v2, v0 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 @@ -3644,21 +3606,20 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v6, s21 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 @@ -3705,20 +3666,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v6, s21 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 @@ -3936,20 +3896,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v7, s17 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, s18 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v10, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 @@ -3996,20 +3955,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v7, s17 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v9, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 @@ -4216,7 +4174,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 @@ -4231,7 +4188,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v15, v0, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc @@ -4263,9 +4220,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: s_mov_b32 s18, -1 @@ -4278,7 +4234,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX7-NEXT: v_and_b32_e32 v1, v11, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v11 ; GFX7-NEXT: v_or_b32_e32 v11, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc @@ -4452,14 +4408,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_mov_b32_e32 v13, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, 16 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v13, v3 -; GFX8-NEXT: v_and_b32_e32 v13, s0, v13 +; GFX8-NEXT: v_bfi_b32 v13, s0, 0, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v13, v2 ; GFX8-NEXT: v_movreld_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -4474,17 +4429,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: s_lshr_b32 m0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s2, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_movreld_b32_e32 v3, v0 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 @@ -4611,7 +4565,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v13, 0 @@ -4626,7 +4579,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v16, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc @@ -4654,13 +4607,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: s_mov_b32 s18, -1 @@ -4673,7 +4625,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index fe7d421d27f8..4598bcc04a50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 @@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 +; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] @@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 2 +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshr_b32 s0, s2, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] @@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 +; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 @@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 @@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc @@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc @@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] @@ -3668,14 +3635,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] @@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 51d0b225b2a2..533b25ef1a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s -; RUN: not --crash llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s ; FIXME: Need constant bus fixup pre-gfx10 for movrel ; ERR: Bad machine code: VOP* instruction violates constant bus restriction diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir index 3ca3928fbfad..45a129283dfc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR-NOT: remark # SI-ERR: remark: <unknown>:0:0: cannot select: %3:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), %2:sgpr(s16), %1:vgpr(s32) (in function: class_s16_vcc_sv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir index d6b8603bc2ae..94175c5f3037 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %1:sgpr(s16) (in function: cos_s16_vs) # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %1:vgpr(s16) (in function: cos_s16_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir index e2d2f1163047..5840f6255cb2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s # VI-ERR-NOT: remark # VI-ERR: remark: <unknown>:0:0: cannot select: %6:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3:vgpr(s16), %4:vgpr(s16), %5:vgpr(s16) (in function: fmed3_s16_vvvv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir index 9feb4d831e07..64c4f875e971 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:sgpr(s16) (in function: fract_s16_vs) # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:vgpr(s16) (in function: fract_s16_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir index 9862d69e520c..32c018b8008e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s # VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0:sgpr(s32) (in function: rcp_legacy_s32_vs) # VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0:vgpr(s32) (in function: rcp_legacy_s32_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir index f9ec4364fd6f..1834177009c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:sgpr(s16) (in function: rcp_s16_vs) # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:vgpr(s16) (in function: rcp_s16_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir index ebe238aae019..61b40d69b250 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s # VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0:sgpr(s32) (in function: rsq_clamp_s32_vs) # VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0:vgpr(s32) (in function: rsq_clamp_s32_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir index 7fd3909405bc..b4baad9cb743 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s # VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0:sgpr(s32) (in function: rsq_legacy_s32_vs) # VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0:vgpr(s32) (in function: rsq_legacy_s32_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir index ac1ff73ce802..fce84c451847 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1:sgpr(s16) (in function: rsq_s16_vs) # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1:vgpr(s16) (in function: rsq_s16_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir index 0a9792f1807c..7ab374f5853a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %1:sgpr(s16) (in function: sin_s16_vs) # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %1:vgpr(s16) (in function: sin_s16_vv) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir index 132596d186a6..15933fad211a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s # Make sure v2s16 SALU operations fail to select diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir new file mode 100644 index 000000000000..ace459979833 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir @@ -0,0 +1,65 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: smax_s64_sv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: smax_s64_sv + ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_SMAX %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: smax_s64_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-LABEL: name: smax_s64_vs + ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]] + %0:sgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $sgpr2_sgpr3 + %2:vgpr(s64) = G_SMAX %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: smax_s64_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: smax_s64_vv + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_SMAX %0, %1 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir new file mode 100644 index 000000000000..f341bdfb22ab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir @@ -0,0 +1,65 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: smin_s64_sv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: smin_s64_sv + ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_SMIN %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: smin_s64_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-LABEL: name: smin_s64_vs + ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]] + %0:sgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $sgpr2_sgpr3 + %2:vgpr(s64) = G_SMIN %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: smin_s64_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: smin_s64_vv + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_SMIN %0, %1 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir index 33f14c179f2a..2df27bdd459d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s --- name: smin_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir index b5f17dea5bb6..137f024f513a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir @@ -1,4 +1,4 @@ -# RUN: not llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s # ERR: LLVM ERROR: cannot select: G_STACKRESTORE %{{[0-9]+}}:vgpr(p5) (in function: stackrestore_waveaddress_vgpr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir new file mode 100644 index 000000000000..9edcf573c833 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir @@ -0,0 +1,65 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: umax_s64_sv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: umax_s64_sv + ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_UMAX %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: umax_s64_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-LABEL: name: umax_s64_vs + ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]] + %0:sgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $sgpr2_sgpr3 + %2:vgpr(s64) = G_UMAX %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: umax_s64_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: umax_s64_vv + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_UMAX %0, %1 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir new file mode 100644 index 000000000000..e6c68112d067 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir @@ -0,0 +1,65 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: umin_s64_sv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: umin_s64_sv + ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_UMIN %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: umin_s64_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-LABEL: name: umin_s64_vs + ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]] + %0:sgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $sgpr2_sgpr3 + %2:vgpr(s64) = G_UMIN %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: umin_s64_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-LABEL: name: umin_s64_vv + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = COPY $vgpr2_vgpr3 + %2:vgpr(s64) = G_UMIN %0, %1 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index fbec70d43b4d..f9d11cb23fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] @@ -94,7 +94,7 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -119,7 +119,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228234 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 2031626 /* regdef:VGPR_32 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 3538954 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 3670026 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42) ret void @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2621449 /* reguse:SReg_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2228233 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %9, 2031625 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2031626 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %12, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %12, 2621449 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2228234 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %11, 2031626 /* regdef:VGPR_32 */, def %12, 2031626 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 @@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir index b7e52cadd8cd..d52b5e5370a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s # Make sure incorrect usage of control flow intrinsics fails to select in case some transform separated the intrinsic from its branch. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir index 9716bb31db3f..4e8ab893c96f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=ERR %s # Make sure there's no crash if there is somehow no successor block. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir index 195ab02571bf..802f7f4946ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -O0 -run-pass=legalizer %s -o - | FileCheck %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -O0 -run-pass=legalizer %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s # ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_FADD %0:_(p1), %1:_ :: (load store seq_cst (s32), addrspace 1) (in function: atomicrmw_fadd_global_i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir index 0d3ee3f69ab2..d8f588ff53c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s # ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_FADD %0:_(p3), %1:_ :: (load store seq_cst (s32), addrspace 3) (in function: atomicrmw_fadd_local_i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir index 22970d311a34..123580c22fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -o - %s | FileCheck %s -# RUN: not llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERROR %s +# RUN: not llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR %s # ERROR: LLVM ERROR: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_XCHG %0:_(p0), %1:_ :: (load store seq_cst (s32)) (in function: atomicrmw_xchg_flat_i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir index 0a15cc3824ae..5f610924a33c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir @@ -196,90 +196,49 @@ body: | ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; SI-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; SI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) - ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32) - ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C5]] - ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) - ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) - ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32) - ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]] - ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) - ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32) + ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32) ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32) - ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) - ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32) - ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) - ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]] - ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) - ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY4]](s32) - ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR6]], [[COPY5]](s32) - ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C1]](s32) - ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]] + ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]] + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32) + ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]] + ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY3]](s32) + ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32) + ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) + ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32) + ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC4]], [[TRUNC5]] + ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32) + ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL4]] ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]] - ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) - ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) - ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) - ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) - ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] - ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]] - ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) - ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) - ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32) - ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]] - ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY8]](s32) - ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16) - ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR8]], [[ZEXT5]](s32) - ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) - ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]] - ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] - ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]] - ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) - ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) - ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32) - ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]] - ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY9]](s32) - ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) - ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[LSHR10]], [[ZEXT7]](s32) - ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) - ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]] - ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32) - ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]] - ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) - ; SI-NEXT: $vgpr0 = COPY [[BITCAST5]](<2 x s16>) + ; SI-NEXT: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) ; ; VI-LABEL: name: test_fshr_v2s16_v2s16 ; VI: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -287,68 +246,42 @@ body: | ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; VI-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) - ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16) - ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16) - ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]] - ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) - ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16) - ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16) - ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]] - ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) - ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16) - ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16) - ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32) - ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]] + ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16) + ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[AND]](s16) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]] + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) + ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16) + ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND2]](s16) + ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]] + ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]] - ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) - ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) - ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) - ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) - ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] - ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] - ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16) - ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16) - ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16) - ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR9]] - ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] - ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] - ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16) - ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16) - ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16) - ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR11]] - ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]] - ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) - ; VI-NEXT: $vgpr0 = COPY [[BITCAST5]](<2 x s16>) + ; VI-NEXT: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) ; ; GFX9-LABEL: name: test_fshr_v2s16_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -699,136 +632,75 @@ body: | ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) - ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) - ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32) - ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C5]] - ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) - ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) - ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32) - ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]] - ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) - ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32) - ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY6]](s32) - ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) - ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32) - ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) - ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]] - ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY7]](s32) - ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[COPY8]](s32) - ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C]](s32) - ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL4]] - ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY4]], [[BITCAST4]] - ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) - ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) - ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) - ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] - ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]] - ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) - ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) - ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32) - ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]] - ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY11]](s32) - ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16) - ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[LSHR7]], [[ZEXT5]](s32) - ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) - ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]] - ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] - ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]] - ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) - ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) - ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32) - ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]] - ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY12]](s32) - ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) - ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[LSHR9]], [[ZEXT7]](s32) - ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32) - ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]] - ; SI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; SI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; SI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] - ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[AND11]](s16) - ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[BITCAST1]], [[ZEXT8]](s32) - ; SI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C5]] - ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND13]], [[COPY13]](s32) - ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) - ; SI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[LSHR11]], [[ZEXT9]](s32) - ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) - ; SI-NEXT: [[OR5:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]] - ; SI-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[BITCAST3]], [[COPY14]](s32) - ; SI-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C]](s32) - ; SI-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL9]] - ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) - ; SI-NEXT: [[XOR6:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY5]], [[BITCAST6]] - ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[XOR6]](<2 x s16>) - ; SI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) - ; SI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C2]] - ; SI-NEXT: [[XOR7:%[0-9]+]]:_(s16) = G_XOR [[TRUNC12]], [[C3]] - ; SI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR7]], [[C2]] - ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND14]](s16) - ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR5]](s16) - ; SI-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT10]](s32) - ; SI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[SHL8]], [[C5]] - ; SI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND16]], [[COPY17]](s32) - ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND15]](s16) - ; SI-NEXT: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[LSHR13]], [[ZEXT11]](s32) - ; SI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32) - ; SI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC13]], [[TRUNC14]] + ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32) + ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32) + ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C4]] + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32) + ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC3]], [[TRUNC4]] + ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY6]](s32) + ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32) + ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) + ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32) + ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC5]], [[TRUNC6]] + ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; SI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]] + ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[BITCAST1]], [[COPY7]](s32) + ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16) + ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[SHL4]], [[ZEXT4]](s32) + ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) + ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C4]] + ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT5]](s32) + ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; SI-NEXT: [[OR2:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC7]], [[TRUNC8]] ; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; SI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) - ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; SI-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; SI-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; SI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C]](s32) - ; SI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL11]] - ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) - ; SI-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; SI-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[BITCAST8]], [[C5]] - ; SI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C]](s32) - ; SI-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL12]] - ; SI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR9]](s32) - ; SI-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[BITCAST9]], [[C5]] - ; SI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C]](s32) - ; SI-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[LSHR15]], [[SHL13]] - ; SI-NEXT: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32) - ; SI-NEXT: $vgpr0 = COPY [[BITCAST10]](<2 x s16>) - ; SI-NEXT: $vgpr1 = COPY [[BITCAST11]](<2 x s16>) - ; SI-NEXT: $vgpr2 = COPY [[BITCAST12]](<2 x s16>) + ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C]](s32) + ; SI-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL6]] + ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST6]], [[C4]] + ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; SI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]] + ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C4]] + ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR6]], [[SHL8]] + ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI-NEXT: $vgpr0 = COPY [[BITCAST8]](<2 x s16>) + ; SI-NEXT: $vgpr1 = COPY [[BITCAST9]](<2 x s16>) + ; SI-NEXT: $vgpr2 = COPY [[BITCAST10]](<2 x s16>) ; ; VI-LABEL: name: test_fshr_v3s16_v3s16 ; VI: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -852,94 +724,59 @@ body: | ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) - ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16) - ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16) - ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]] - ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) - ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C1]](s16) - ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16) - ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]] - ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C1]](s16) - ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C1]](s16) - ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; VI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C]](s32) - ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]] - ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY4]], [[BITCAST4]] - ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) - ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) - ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) - ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] - ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] - ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16) - ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C1]](s16) - ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[LSHR7]], [[AND5]](s16) - ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR8]] - ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] - ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] - ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16) - ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C1]](s16) - ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[LSHR9]], [[AND7]](s16) - ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR10]] - ; VI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]] - ; VI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]] - ; VI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] - ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND8]](s16) - ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C1]](s16) - ; VI-NEXT: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[LSHR11]], [[AND9]](s16) - ; VI-NEXT: [[OR5:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL7]], [[LSHR12]] - ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C1]](s16) - ; VI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32) - ; VI-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[COPY7]], [[SHL9]] - ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) - ; VI-NEXT: [[XOR6:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY5]], [[BITCAST6]] - ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[XOR6]](<2 x s16>) - ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) - ; VI-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C2]] - ; VI-NEXT: [[XOR7:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C3]] - ; VI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR7]], [[C2]] - ; VI-NEXT: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[OR5]], [[AND10]](s16) - ; VI-NEXT: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[SHL8]], [[C1]](s16) - ; VI-NEXT: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[LSHR13]], [[AND11]](s16) - ; VI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL10]], [[LSHR14]] + ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16) + ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND]](s16) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]] + ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]] + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) + ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16) + ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[AND2]](s16) + ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]] + ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]] + ; VI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]] + ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) + ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[SHL4]], [[AND5]](s16) + ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[AND4]](s16) + ; VI-NEXT: [[OR2:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR5]] ; VI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; VI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) - ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; VI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; VI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL11]] - ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) - ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) - ; VI-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[BITCAST8]], [[C4]] - ; VI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32) - ; VI-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL12]] - ; VI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR9]](s32) - ; VI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST9]], [[C4]] - ; VI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C]](s32) - ; VI-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[LSHR15]], [[SHL13]] - ; VI-NEXT: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32) - ; VI-NEXT: $vgpr0 = COPY [[BITCAST10]](<2 x s16>) - ; VI-NEXT: $vgpr1 = COPY [[BITCAST11]](<2 x s16>) - ; VI-NEXT: $vgpr2 = COPY [[BITCAST12]](<2 x s16>) + ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[BITCAST6]], [[C4]] + ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C4]] + ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR6]], [[SHL8]] + ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI-NEXT: $vgpr0 = COPY [[BITCAST8]](<2 x s16>) + ; VI-NEXT: $vgpr1 = COPY [[BITCAST9]](<2 x s16>) + ; VI-NEXT: $vgpr2 = COPY [[BITCAST10]](<2 x s16>) ; ; GFX9-LABEL: name: test_fshr_v3s16_v3s16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -1026,168 +863,87 @@ body: | ; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; SI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; SI-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) - ; SI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) - ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32) - ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) - ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C5]] - ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) - ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) - ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32) - ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]] - ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) - ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32) + ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32) ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) - ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32) - ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) - ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32) - ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) - ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]] - ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) - ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY4]](s32) - ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR6]], [[COPY5]](s32) - ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C1]](s32) - ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]] + ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]] + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32) + ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]] + ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY3]](s32) + ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32) + ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16) + ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32) + ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC4]], [[TRUNC5]] + ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32) + ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL4]] ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]] - ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) - ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) - ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) - ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) - ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] - ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]] - ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) - ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) - ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32) - ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]] - ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY8]](s32) - ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16) - ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR8]], [[ZEXT5]](s32) - ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) - ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]] - ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] - ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]] - ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) - ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) - ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32) + ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]] + ; SI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]] + ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[BITCAST4]], [[COPY4]](s32) + ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16) + ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[SHL5]], [[ZEXT6]](s32) ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]] - ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY9]](s32) - ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) - ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[LSHR10]], [[ZEXT7]](s32) - ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) - ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]] - ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32) - ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]] - ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) - ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; SI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) - ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; SI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) - ; SI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] - ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND11]](s16) - ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[BITCAST6]], [[ZEXT10]](s32) + ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C4]] + ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT7]](s32) + ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) + ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]] + ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]] + ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]] + ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]] + ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[COPY5]](s32) + ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[SHL7]], [[ZEXT8]](s32) ; SI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C5]] - ; SI-NEXT: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[AND13]], [[COPY10]](s32) - ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) - ; SI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[LSHR14]], [[ZEXT11]](s32) - ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32) - ; SI-NEXT: [[OR6:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]] - ; SI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; SI-NEXT: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; SI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]] - ; SI-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[AND14]](s16) - ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[LSHR12]], [[ZEXT12]](s32) - ; SI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[LSHR13]], [[COPY11]](s32) - ; SI-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[AND15]](s16) - ; SI-NEXT: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[LSHR16]], [[ZEXT13]](s32) - ; SI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32) - ; SI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC12]], [[TRUNC13]] - ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; SI-NEXT: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) - ; SI-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[BITCAST8]], [[COPY12]](s32) - ; SI-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LSHR18]], [[COPY13]](s32) - ; SI-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; SI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32) - ; SI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL12]] - ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) - ; SI-NEXT: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]] - ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>) - ; SI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32) - ; SI-NEXT: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32) - ; SI-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32) - ; SI-NEXT: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]] - ; SI-NEXT: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]] - ; SI-NEXT: [[AND17:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]] - ; SI-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[AND16]](s16) - ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR6]](s16) - ; SI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT14]](s32) - ; SI-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; SI-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[SHL10]], [[C5]] - ; SI-NEXT: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[AND18]], [[COPY16]](s32) - ; SI-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[AND17]](s16) - ; SI-NEXT: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[LSHR20]], [[ZEXT15]](s32) - ; SI-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32) - ; SI-NEXT: [[OR9:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC16]], [[TRUNC17]] - ; SI-NEXT: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]] - ; SI-NEXT: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]] - ; SI-NEXT: [[AND20:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]] - ; SI-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[AND19]](s16) - ; SI-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR7]](s16) - ; SI-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT3]], [[ZEXT16]](s32) - ; SI-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; SI-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[AND21:%[0-9]+]]:_(s32) = G_AND [[SHL11]], [[C5]] - ; SI-NEXT: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[AND21]], [[COPY17]](s32) - ; SI-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[AND20]](s16) - ; SI-NEXT: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[LSHR22]], [[ZEXT17]](s32) - ; SI-NEXT: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32) - ; SI-NEXT: [[OR10:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC18]], [[TRUNC19]] - ; SI-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; SI-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) - ; SI-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT19]], [[C1]](s32) - ; SI-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT18]], [[SHL15]] - ; SI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32) - ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR6]], [[ZEXT9]](s32) + ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]] + ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C]](s32) + ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL9]] + ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; VI-LABEL: name: test_fshr_v4s16_v4s16 @@ -1199,125 +955,73 @@ body: | ; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; VI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; VI-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) - ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] - ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) - ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16) - ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16) - ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]] - ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] - ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) - ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16) - ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16) - ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]] - ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) - ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16) - ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16) - ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32) - ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]] + ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16) + ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[AND]](s16) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]] + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) + ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16) + ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND2]](s16) + ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]] + ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]] - ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) + ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) - ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) - ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) - ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] - ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] - ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] - ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16) - ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16) - ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16) - ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR9]] - ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] - ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] - ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] - ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16) - ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16) - ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16) - ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR11]] - ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) - ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]] - ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) - ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) - ; VI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) - ; VI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) - ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; VI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) - ; VI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) - ; VI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) - ; VI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] - ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC8]], [[AND8]](s16) - ; VI-NEXT: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC10]], [[C]](s16) - ; VI-NEXT: [[LSHR15:%[0-9]+]]:_(s16) = G_LSHR [[LSHR14]], [[AND9]](s16) - ; VI-NEXT: [[OR6:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL8]], [[LSHR15]] - ; VI-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] - ; VI-NEXT: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] - ; VI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]] - ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[TRUNC9]], [[AND10]](s16) - ; VI-NEXT: [[LSHR16:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC11]], [[C]](s16) - ; VI-NEXT: [[LSHR17:%[0-9]+]]:_(s16) = G_LSHR [[LSHR16]], [[AND11]](s16) - ; VI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL9]], [[LSHR17]] - ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; VI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) - ; VI-NEXT: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) - ; VI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32) - ; VI-NEXT: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[TRUNC12]], [[C]](s16) - ; VI-NEXT: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[TRUNC13]], [[C]](s16) - ; VI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; VI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C1]](s32) - ; VI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL12]] - ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) - ; VI-NEXT: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]] - ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>) - ; VI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32) - ; VI-NEXT: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32) - ; VI-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32) - ; VI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]] - ; VI-NEXT: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]] - ; VI-NEXT: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]] - ; VI-NEXT: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[OR6]], [[AND12]](s16) - ; VI-NEXT: [[LSHR20:%[0-9]+]]:_(s16) = G_LSHR [[SHL10]], [[C]](s16) - ; VI-NEXT: [[LSHR21:%[0-9]+]]:_(s16) = G_LSHR [[LSHR20]], [[AND13]](s16) - ; VI-NEXT: [[OR9:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL13]], [[LSHR21]] - ; VI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]] - ; VI-NEXT: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]] - ; VI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]] - ; VI-NEXT: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[OR7]], [[AND14]](s16) - ; VI-NEXT: [[LSHR22:%[0-9]+]]:_(s16) = G_LSHR [[SHL11]], [[C]](s16) - ; VI-NEXT: [[LSHR23:%[0-9]+]]:_(s16) = G_LSHR [[LSHR22]], [[AND15]](s16) - ; VI-NEXT: [[OR10:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL14]], [[LSHR23]] - ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) - ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) - ; VI-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32) - ; VI-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL15]] - ; VI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32) - ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; VI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C1]] + ; VI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC10]], [[C2]] + ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[TRUNC6]], [[C3]](s16) + ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[SHL5]], [[AND5]](s16) + ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC8]], [[AND4]](s16) + ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR8]] + ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C1]] + ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC11]], [[C2]] + ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]] + ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[TRUNC7]], [[C3]](s16) + ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[SHL7]], [[AND7]](s16) + ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC9]], [[AND6]](s16) + ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL8]], [[LSHR9]] + ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) + ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX9-LABEL: name: test_fshr_v4s16_v4s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir index b43c18ee2aa3..80737815cc16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: not llc -mtriple=amdgcn -run-pass=legalizer -o /dev/null %s 2>&1 | FileCheck %s +# RUN: not llc -mtriple=amdgcn -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck %s # CHECK: LLVM ERROR: unable to legalize instruction: %3:_(p0) = G_JUMP_TABLE %jump-table.0 (in function: jt_test) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir index aebda3f28d5f..cbd9c2173b7e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s --- name: s_buffer_load_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir index db11855d2967..45714fd99d7b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir @@ -4,6 +4,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s --- name: test_smax_s32 @@ -34,6 +35,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32) + ; + ; GFX1250-LABEL: name: test_smax_s32 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SMAX %0, %1 @@ -72,6 +81,14 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) + ; + ; GFX1250-LABEL: name: test_smax_s64 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMAX]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_SMAX %0, %1 @@ -115,6 +132,17 @@ body: | ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_smax_s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -165,6 +193,19 @@ body: | ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_smax_s8 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) + ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s8) = G_TRUNC %0 @@ -209,6 +250,16 @@ body: | ; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17 ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32) + ; + ; GFX1250-LABEL: name: test_smax_s17 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17 + ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17 + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s17) = G_TRUNC %0 @@ -259,6 +310,18 @@ body: | ; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX1250-LABEL: name: test_smax_v2s32 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_SMAX %0, %1 @@ -309,6 +372,19 @@ body: | ; GFX9-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_smax_v3s32 + ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV3]] + ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV4]] + ; GFX1250-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = G_SMAX %0, %1 @@ -375,6 +451,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>) + ; + ; GFX1250-LABEL: name: test_smax_v2s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SMAX %0, %1 @@ -461,6 +545,26 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_smax_v3s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMAX]](<2 x s16>) + ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16) + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) + ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMAX %0, %1 @@ -568,6 +672,18 @@ body: | ; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]] ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX1250-LABEL: name: test_smax_v4s16 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_SMAX %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir index d366242db087..88fe5d0d5433 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir @@ -4,6 +4,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s --- name: test_smin_s32 @@ -34,6 +35,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32) + ; + ; GFX1250-LABEL: name: test_smin_s32 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SMIN %0, %1 @@ -72,6 +81,14 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) + ; + ; GFX1250-LABEL: name: test_smin_s64 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMIN]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_SMIN %0, %1 @@ -115,6 +132,17 @@ body: | ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_smin_s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -165,6 +193,19 @@ body: | ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_smin_s8 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) + ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s8) = G_TRUNC %0 @@ -209,6 +250,16 @@ body: | ; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17 ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32) + ; + ; GFX1250-LABEL: name: test_smin_s17 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17 + ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17 + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s17) = G_TRUNC %0 @@ -259,6 +310,18 @@ body: | ; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX1250-LABEL: name: test_smin_v2s32 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_SMIN %0, %1 @@ -309,6 +372,19 @@ body: | ; GFX9-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_smin_v3s32 + ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV3]] + ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV4]] + ; GFX1250-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = G_SMIN %0, %1 @@ -375,6 +451,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>) + ; + ; GFX1250-LABEL: name: test_smin_v2s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SMIN %0, %1 @@ -461,6 +545,26 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_smin_v3s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMIN]](<2 x s16>) + ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16) + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) + ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMIN %0, %1 @@ -568,6 +672,18 @@ body: | ; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]] ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX1250-LABEL: name: test_smin_v4s16 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_SMIN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir index e8fa4e9d822f..32b526e28912 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir @@ -4,6 +4,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s --- name: test_umax_s32 @@ -34,6 +35,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32) + ; + ; GFX1250-LABEL: name: test_umax_s32 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_UMAX %0, %1 @@ -72,6 +81,14 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) + ; + ; GFX1250-LABEL: name: test_umax_s64 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s64) = G_UMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMAX]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_UMAX %0, %1 @@ -116,6 +133,17 @@ body: | ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_umax_s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -169,6 +197,20 @@ body: | ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_umax_s8 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s8) = G_TRUNC %0 @@ -216,6 +258,17 @@ body: | ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32) + ; + ; GFX1250-LABEL: name: test_umax_s17 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071 + ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s17) = G_TRUNC %0 @@ -266,6 +319,18 @@ body: | ; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX1250-LABEL: name: test_umax_v2s32 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_UMAX %0, %1 @@ -316,6 +381,19 @@ body: | ; GFX9-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_umax_v3s32 + ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV3]] + ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV4]] + ; GFX1250-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = G_UMAX %0, %1 @@ -378,6 +456,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>) + ; + ; GFX1250-LABEL: name: test_umax_v2s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UMAX %0, %1 @@ -463,6 +549,26 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_umax_v3s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMAX]](<2 x s16>) + ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16) + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) + ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMAX %0, %1 @@ -562,6 +668,18 @@ body: | ; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]] ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX1250-LABEL: name: test_umax_v4s16 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_UMAX %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir index 8ee0df5ce670..8666c29c99d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir @@ -4,6 +4,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s --- name: test_umin_s32 @@ -34,6 +35,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32) + ; + ; GFX1250-LABEL: name: test_umin_s32 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_UMIN %0, %1 @@ -72,6 +81,14 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) + ; + ; GFX1250-LABEL: name: test_umin_s64 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMIN]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_UMIN %0, %1 @@ -116,6 +133,17 @@ body: | ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_umin_s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -169,6 +197,20 @@ body: | ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX1250-LABEL: name: test_umin_s8 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]] + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16) + ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s8) = G_TRUNC %0 @@ -216,6 +258,17 @@ body: | ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32) + ; + ; GFX1250-LABEL: name: test_umin_s17 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071 + ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s17) = G_TRUNC %0 @@ -266,6 +319,18 @@ body: | ; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX1250-LABEL: name: test_umin_v2s32 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_UMIN %0, %1 @@ -316,6 +381,19 @@ body: | ; GFX9-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_umin_v3s32 + ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV3]] + ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV4]] + ; GFX1250-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]] + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32) + ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = G_UMIN %0, %1 @@ -378,6 +456,14 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]] ; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>) + ; + ; GFX1250-LABEL: name: test_umin_v2s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]] + ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UMIN %0, %1 @@ -463,6 +549,26 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX1250-LABEL: name: test_umin_v3s16 + ; GFX1250: liveins: $vgpr0, $vgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] + ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMIN]](<2 x s16>) + ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16) + ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32) + ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMIN %0, %1 @@ -562,6 +668,18 @@ body: | ; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX1250-LABEL: name: test_umin_v4s16 + ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] + ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] + ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>) + ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_UMIN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 7916267c6eca..800df8987703 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250 declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) @@ -13,11 +14,30 @@ declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1) declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) { -; GFX-LABEL: abs_sgpr_i16: -; GFX: ; %bb.0: -; GFX-NEXT: s_sext_i32_i16 s0, s0 -; GFX-NEXT: s_abs_i32 s0, s0 -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i16 s0, s0 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-NEXT: s_abs_i32 s0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_sext_i32_i16 s0, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: ; return to shader part epilog %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } @@ -32,14 +52,42 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) { } define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) { -; GFX-LABEL: abs_sgpr_i64: -; GFX: ; %bb.0: -; GFX-NEXT: s_ashr_i32 s2, s1, 31 -; GFX-NEXT: s_add_u32 s0, s0, s2 -; GFX-NEXT: s_mov_b32 s3, s2 -; GFX-NEXT: s_addc_u32 s1, s1, s2 -; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_ashr_i32 s2, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s1, s1, s2 +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_ashr_i32 s2, s1, 31 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s2 +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s2, s1, 31 +; GFX10-NEXT: s_add_u32 s0, s0, s2 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_addc_u32 s1, s1, s2 +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_ashr_i32 s2, s1, 31 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_mov_b32 s3, s2 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: ; return to shader part epilog %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } @@ -78,6 +126,14 @@ define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_i16 v0, v0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } @@ -103,6 +159,14 @@ define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) { ; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } @@ -140,6 +204,20 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } @@ -192,6 +270,24 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1 +; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4 +; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6 +; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1250-NEXT: ; return to shader part epilog %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -243,6 +339,21 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_v2i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0 +; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_i16 v0, v0, v2 +; GFX1250-NEXT: v_max_i16 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -307,6 +418,27 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_v3i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0 +; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2 +; GFX1250-NEXT: v_max_i16 v0, v0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_max_i16 v1, v1, v4 +; GFX1250-NEXT: v_max_i16 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-NEXT: ; return to shader part epilog %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -341,6 +473,16 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ; GFX10-NEXT: s_abs_i32 s0, s0 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_sext_i32_i16 s1, s0 +; GFX1250-NEXT: s_ashr_i32 s0, s0, 16 +; GFX1250-NEXT: s_abs_i32 s1, s1 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX1250-NEXT: ; return to shader part epilog %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -375,6 +517,14 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX10-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -416,6 +566,17 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0 ; GFX10-NEXT: s_abs_i32 s1, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_v3i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_sext_i32_i16 s2, s0 +; GFX1250-NEXT: s_ashr_i32 s0, s0, 16 +; GFX1250-NEXT: s_abs_i32 s2, s2 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: s_sext_i32_i16 s1, s1 +; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX1250-NEXT: s_abs_i32 s1, s1 +; GFX1250-NEXT: ; return to shader part epilog %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } @@ -460,6 +621,18 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_vgpr_v3i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 +; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 +; GFX1250-NEXT: v_max_i16 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index a09703285087..bd6634f25077 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -358,12 +358,12 @@ main_body: define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GFX11-TRUE16-LABEL: v_interp_f16_imm_params: ; GFX11-TRUE16: ; %bb.0: ; %main_body -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 -; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l @@ -383,12 +383,12 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; ; GFX12-TRUE16-LABEL: v_interp_f16_imm_params: ; GFX12-TRUE16: ; %bb.0: ; %main_body -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 -; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 2b595b9bbecc..b0ca1e8ef3df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s -filetype=null 2>&1 | FileCheck -check-prefix=ERR %s ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 6bb104311a4d..ab8d8c192187 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s @@ -64,6 +66,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7 ; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align1: +; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: +; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa +; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4 +; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31] +; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -256,6 +304,34 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6 ; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align2: +; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: +; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5 +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2 +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4 +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6 +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8 +; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4 +; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31] +; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -346,16 +422,35 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_v3i32_align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align4: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align4: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_v3i32_align4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align4: ; GFX9: ; %bb.0: @@ -392,16 +487,35 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { } define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_i96_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_i96_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_i96_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_i96_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_i96_align8: ; GFX9: ; %bb.0: @@ -438,16 +552,35 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_v3i32_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_v3i32_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align8: ; GFX9: ; %bb.0: @@ -484,16 +617,35 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { } define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_v6i16_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_v6i16_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v6i16_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_v6i16_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v6i16_align8: ; GFX9: ; %bb.0: @@ -539,28 +691,67 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { } define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_v12i8_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13 -; GFX12-NEXT: v_mov_b32_e32 v8, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v12 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_v12i8_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX12-UNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13 +; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v8, v2 +; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v2, v12 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v12i8_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX12-NOUNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13 +; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, v2 +; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, v12 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_v12i8_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v13, 8, v0 :: v_dual_lshrrev_b32 v12, 16, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 24, v0 :: v_dual_lshrrev_b32 v5, 8, v1 +; GFX1250-NEXT: v_dual_lshrrev_b32 v6, 16, v1 :: v_dual_lshrrev_b32 v7, 24, v1 +; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 8, v2 :: v_dual_lshrrev_b32 v10, 16, v2 +; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_lshrrev_b32 v11, 24, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v1, v13 +; GFX1250-NEXT: v_mov_b32_e32 v2, v12 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v12i8_align8: ; GFX9: ; %bb.0: @@ -632,16 +823,35 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) { -; GFX12-LABEL: v_load_constant_v3i32_align16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align16: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align16: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_constant_v3i32_align16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align16: ; GFX9: ; %bb.0: @@ -720,6 +930,53 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5 ; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog ; +; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1: +; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: +; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s5, s[0:1], 0x5 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s6, s[0:1], 0x7 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s7, s[0:1], 0x6 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s8, s[0:1], 0x9 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s9, s[0:1], 0xb +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa +; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8 +; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5 +; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog +; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 @@ -916,6 +1173,34 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7 ; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog ; +; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2: +; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: +; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5 +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2 +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6 +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s4, s[0:1], 0xa +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4 +; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8 +; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s5 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s6 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7 +; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog +; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb4862..8533e34ff13f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -807,10 +807,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { ; GFX8-LABEL: v_lshr_v2i16_15: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 15 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 31 +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 15, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_v2i16_15: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 07d5ff2036d9..b75eb737534e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1379,45 +1379,43 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, 2 +; GFX6-NEXT: v_mov_b32_e32 v0, 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_mov_b32_e32 v4, 2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_mov_b32_e32 v2, 2 +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index afabc7b62386..917b50f14bfc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -99,15 +99,13 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) { ; GCN-LABEL: v_orn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_bfi_b32 v0, v1, v0, -1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_orn2_i32: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, v0, -1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 @@ -117,14 +115,12 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) { define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) { ; GCN-LABEL: v_orn2_i32_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_or_b32_e32 v0, s2, v0 +; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i32_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 @@ -135,14 +131,12 @@ define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) { define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) { ; GCN-LABEL: v_orn2_i32_vs: ; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s0, s2 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_bfi_b32 v0, s2, v0, -1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i32_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_not_b32 s0, s2 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, v0, -1 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 @@ -247,19 +241,15 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) { ; GCN-LABEL: v_orn2_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_not_b32_e32 v2, v2 -; GCN-NEXT: v_not_b32_e32 v3, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_bfi_b32 v0, v2, v0, -1 +; GCN-NEXT: v_bfi_b32 v1, v3, v1, -1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_orn2_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2 -; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, v0, -1 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, v1, -1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 @@ -269,18 +259,14 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) { define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) { ; GCN-LABEL: v_orn2_i64_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, s2, v0 -; GCN-NEXT: v_or_b32_e32 v1, s3, v1 +; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1 +; GCN-NEXT: v_bfi_b32 v1, v1, s3, -1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_orn2_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 -; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, s3, -1 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir new file mode 100644 index 000000000000..f2d3272e8727 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir @@ -0,0 +1,77 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s + +# COM: Check that the pass doesn't crash. + +--- +name: test_inline_asm +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_inline_asm + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %5(s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + %2:vgpr(s32) = COPY %1(s32) + %3:vgpr(s32) = G_FMUL %0, %2 + %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %5:vgpr_32 + %6:vgpr(s32) = COPY %4(s32) + %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32) + $vgpr0 = COPY %7(s32) +... + +--- +name: test_unmerge_values +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_unmerge_values + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], [[C2]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[C2]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + %2:vgpr(s32) = COPY %1(s32) + %3:vgpr(s32) = G_FMUL %0, %2 + %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + %5:vgpr(s64) = G_CONSTANT i64 123456789 + %6:vgpr(s32), %7:vgpr(s32) = G_UNMERGE_VALUES %5(s64) + %8:vgpr(s32) = COPY %4(s32) + %9:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %7(s32), %8(s32) + $vgpr0 = COPY %7(s32) +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index 89681e7329e7..c82f7c53696d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12 +; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12 ; Natural mapping define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir index 1b64099d6bf5..e448c4cba094 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12 +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s -check-prefixes=GCN,GFX12 --- | define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir index 997ac804f710..b2ff0995ce57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s --- | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 832f066adaa8..2f956d7a0a53 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -229,21 +229,23 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 +; GFX6-NEXT: v_min_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 -; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -2951,20 +2953,22 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 +; GFX6-NEXT: v_min_i32_e32 v4, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 +; GFX6-NEXT: v_max_i32_e32 v4, s0, v4 +; GFX6-NEXT: v_min_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 -; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 +; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 8d8eca162257..19dc20c51004 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1067,24 +1067,24 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x1000, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -1660,24 +1660,24 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x12d8fb, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2673ac4fb5ba..c1b225562b77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -233,16 +233,17 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 -; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v6 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 @@ -1260,7 +1261,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -1279,7 +1281,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 38ef707fa65a..3685eed5043a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -71,14 +71,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: s_lshr_b32 s5, s5, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -90,7 +90,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s2, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -114,7 +114,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s3, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -181,37 +181,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: s_lshr_b32 s4, s6, 8 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:6 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -221,7 +221,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 @@ -240,38 +240,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s5, s0, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 8 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: s_lshr_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s7, 0xffff, s2 -; GFX11-NEXT: s_lshr_b32 s2, s6, 8 -; GFX11-NEXT: s_lshr_b32 s6, s5, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 ; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s4, s4, 8 -; GFX11-NEXT: s_lshr_b32 s5, s0, 8 ; GFX11-NEXT: s_lshr_b32 s0, s7, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: s_lshr_b32 s0, s2, 24 +; GFX11-NEXT: s_lshr_b32 s1, s9, 8 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:7 ; GFX11-NEXT: v_mov_b32_e32 v4, s0 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX11-NEXT: s_lshr_b32 s1, s3, 16 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 +; GFX11-NEXT: s_lshr_b32 s1, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_lshr_b32 s0, s3, 24 ; GFX11-NEXT: v_mov_b32_e32 v8, s0 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 ; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 1d2d330eeb61..cce6bd9301cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -72,15 +72,15 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: s_lshr_b32 s3, s5, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -104,7 +104,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s2, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -163,37 +163,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: s_lshr_b32 s3, s3, 8 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s8, 8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s5, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:6 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -206,37 +206,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX11-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s5, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: s_lshr_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX11-NEXT: s_lshr_b32 s2, s5, 8 -; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v9, s5 ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshr_b32 s0, s6, 8 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3 -; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, s6 +; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: s_lshr_b32 s1, s8, 8 +; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s1 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v10 offset:7 +; GFX11-NEXT: ds_store_b8 v1, v11 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:7 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v11 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:10 -; GFX11-NEXT: ds_store_b8 v1, v12 offset:11 +; GFX11-NEXT: ds_store_b8 v1, v12 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v10 offset:11 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 1 ret void |
