diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/calling-conventions.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/calling-conventions.ll | 1181 |
1 files changed, 1171 insertions, 10 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 2db7b28c7de9..ddd3b1520bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-FAKE16 %s ; Make sure we don't crash or assert on spir_kernel calling convention. @@ -34,6 +36,14 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: kernel: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm entry: store i32 0, ptr addrspace(1) %out ret void @@ -70,6 +80,16 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: ps_ret_cc_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: ps_ret_cc_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -96,26 +116,71 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, 1.0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ps_ret_cc_inreg_f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_f16 s0, s0, 1.0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } define fastcc float @fastcc(float %arg0) #0 { -; GCN-LABEL: fastcc: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: fastcc: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: fastcc: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fastcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fastcc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } define coldcc float @coldcc(float %arg0) #0 { -; GCN-LABEL: coldcc: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: coldcc: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: coldcc: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: coldcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: coldcc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } @@ -209,6 +274,23 @@ define amdgpu_kernel void @call_coldcc() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: call_coldcc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1250-NEXT: s_get_pc_i64 s[6:7] +; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], coldcc@gotpcrel+4 +; GFX1250-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX1250-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], 36 +; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1250-NEXT: s_mov_b32 s32, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13] +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %val = call coldcc float @coldcc(float 1.0) store float %val, ptr addrspace(1) poison ret void @@ -303,6 +385,23 @@ define amdgpu_kernel void @call_fastcc() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: call_fastcc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1250-NEXT: s_get_pc_i64 s[6:7] +; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], fastcc@gotpcrel+4 +; GFX1250-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX1250-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], 36 +; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1250-NEXT: s_mov_b32 s32, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13] +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %val = call fastcc float @fastcc(float 1.0) store float %val, ptr addrspace(1) poison ret void @@ -331,6 +430,16 @@ define amdgpu_cs half @cs_mesa(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: cs_mesa: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: cs_mesa: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -358,6 +467,16 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: ps_mesa_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: ps_mesa_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -385,6 +504,16 @@ define amdgpu_vs half @vs_mesa(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: vs_mesa: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: vs_mesa: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -412,6 +541,16 @@ define amdgpu_gs half @gs_mesa(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: gs_mesa: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: gs_mesa: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -439,6 +578,16 @@ define amdgpu_hs half @hs_mesa(half %arg0) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-TRUE16-LABEL: hs_mesa: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: hs_mesa: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } @@ -468,6 +617,11 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ps_mesa_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, <half 1.0, half 1.0> ret <2 x half> %add } @@ -497,6 +651,11 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ps_mesa_inreg_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, <half 1.0, half 1.0> ret <2 x half> %add } @@ -528,6 +687,12 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %add = add <2 x i16> %arg0, <i16 1, i16 1> store <2 x i16> %add, ptr addrspace(1) poison ret void @@ -563,6 +728,12 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { ; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %add = add <2 x i16> %arg0, <i16 1, i16 1> store <2 x i16> %add, ptr addrspace(1) poison ret void @@ -603,6 +774,12 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ps_mesa_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0> ret <4 x half> %add } @@ -644,6 +821,12 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ps_mesa_inreg_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0> ret <4 x half> %add } @@ -685,6 +868,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 +; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-NEXT: s_add_co_i32 s1, s1, 2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX1250-NEXT: s_endpgm %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3> store <3 x i32> %add, ptr addrspace(1) poison ret void @@ -717,6 +911,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) { ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_f32 s0, s0, 1.0 +; GFX1250-NEXT: s_add_f32 s1, s1, 2.0 +; GFX1250-NEXT: s_add_f32 s2, s2, 4.0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_2) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX1250-NEXT: s_endpgm %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0> store <3 x float> %add, ptr addrspace(1) poison ret void @@ -772,6 +977,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_v5i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_co_i32 s3, s3, 4 +; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 +; GFX1250-NEXT: s_add_co_i32 s1, s1, 2 +; GFX1250-NEXT: s_add_co_i32 s4, s4, 5 +; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v3, s3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX1250-NEXT: s_endpgm %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5> store <5 x i32> %add, ptr addrspace(1) poison ret void @@ -813,6 +1034,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_v5f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_f32 s3, s3, -1.0 +; GFX1250-NEXT: s_add_f32 s4, s4, 0.5 +; GFX1250-NEXT: s_add_f32 s0, s0, 1.0 +; GFX1250-NEXT: s_add_f32 s1, s1, 2.0 +; GFX1250-NEXT: s_add_f32 s2, s2, 4.0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2) +; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v3, s3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX1250-NEXT: s_endpgm %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5> store <5 x float> %add, ptr addrspace(1) poison ret void @@ -845,6 +1082,13 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_add_nc_u32 v2, 3, v2 :: v_dual_add_nc_u32 v1, 2, v1 +; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX1250-NEXT: s_endpgm %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3> store <3 x i32> %add, ptr addrspace(1) poison ret void @@ -876,6 +1120,13 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 +; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX1250-NEXT: s_endpgm %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0> store <3 x float> %add, ptr addrspace(1) poison ret void @@ -917,6 +1168,16 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_v5i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2 +; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4 +; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX1250-NEXT: s_endpgm %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5> store <5 x i32> %add, ptr addrspace(1) poison ret void @@ -956,6 +1217,16 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_v5f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 +; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 +; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX1250-NEXT: s_endpgm %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5> store <5 x float> %add, ptr addrspace(1) poison ret void @@ -987,6 +1258,18 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v0 ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: ps_mesa_i16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.l +; GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v0 +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: ps_mesa_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, v0, v0 +; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-FAKE16-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) poison ret void @@ -1016,6 +1299,14 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ps_mesa_inreg_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) poison ret void @@ -1059,6 +1350,16 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm entry: %add = add i8 %arg0, %arg0 store i8 %add, ptr addrspace(1) poison @@ -1114,6 +1415,22 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v2i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_lshl_b32 s1, s1, 8 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm entry: %add = add <2 x i8> %arg0, %arg0 store <2 x i8> %add, ptr addrspace(1) null @@ -1199,6 +1516,32 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v4i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s1, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s2, s0, 24 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s0 +; GFX1250-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_lshl_b32 s0, s0, 8 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 8 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_or_b32 s0, s3, s0 +; GFX1250-NEXT: s_or_b32 s1, s1, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm entry: %add = add <4 x i8> %arg0, %arg0 store <4 x i8> %add, ptr addrspace(1) null @@ -1271,6 +1614,27 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; GFX11-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-NEXT: global_store_b16 v[2:3], v5, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v3i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 2 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX1250-NEXT: s_lshr_b32 s1, s0, 16 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_lshl_b32 s2, s2, 8 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_or_b32 s0, s0, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b8 v[0:1], v4, off +; GFX1250-NEXT: global_store_b16 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm entry: %add = add <3 x i8> %arg0, %arg0 store <3 x i8> %add, ptr addrspace(1) null @@ -1370,6 +1734,36 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; GFX11-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v5i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s0, 24 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s0 +; GFX1250-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_lshl_b32 s0, s0, 8 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 8 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_or_b32 s0, s4, s0 +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_or_b32 s0, s0, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b8 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm entry: %add = add <5 x i8> %arg0, %arg0 store <5 x i8> %add, ptr addrspace(1) null @@ -1505,6 +1899,48 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v8i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s0, 24 +; GFX1250-NEXT: s_lshr_b32 s4, s1, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s1, 24 +; GFX1250-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 +; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 +; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_lshl_b32 s7, s7, 8 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 8 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_lshl_b32 s6, s6, 8 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 8 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_or_b32 s1, s1, s7 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_or_b32 s0, s0, s6 +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff +; GFX1250-NEXT: s_lshl_b32 s3, s4, 16 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_or_b32 s1, s1, s3 +; GFX1250-NEXT: s_or_b32 s0, s0, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm entry: %add = add <8 x i8> %arg0, %arg0 store <8 x i8> %add, ptr addrspace(1) null @@ -1740,6 +2176,81 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v16i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[4:5], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s6, s1, 16 +; GFX1250-NEXT: s_lshr_b32 s7, s1, 24 +; GFX1250-NEXT: s_lshr_b32 s8, s2, 16 +; GFX1250-NEXT: s_lshr_b32 s9, s2, 24 +; GFX1250-NEXT: s_lshr_b32 s10, s3, 16 +; GFX1250-NEXT: s_lshr_b32 s11, s3, 24 +; GFX1250-NEXT: s_lshr_b32 s4, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s0, 24 +; GFX1250-NEXT: s_bfe_u32 s12, s0, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s15, s3, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 +; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 +; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 +; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 +; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_add_co_i32 s15, s15, s15 +; GFX1250-NEXT: s_add_co_i32 s14, s14, s14 +; GFX1250-NEXT: s_lshl_b32 s11, s11, 8 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xff +; GFX1250-NEXT: s_lshl_b32 s9, s9, 8 +; GFX1250-NEXT: s_and_b32 s8, s8, 0xff +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s13, s13, s13 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 8 +; GFX1250-NEXT: s_and_b32 s6, s6, 0xff +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s12, s12, s12 +; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 +; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_lshl_b32 s15, s15, 8 +; GFX1250-NEXT: s_or_b32 s10, s10, s11 +; GFX1250-NEXT: s_lshl_b32 s11, s14, 8 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_lshl_b32 s9, s13, 8 +; GFX1250-NEXT: s_or_b32 s6, s6, s7 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_lshl_b32 s7, s12, 8 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 8 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_or_b32 s3, s3, s15 +; GFX1250-NEXT: s_or_b32 s2, s2, s11 +; GFX1250-NEXT: s_or_b32 s1, s1, s9 +; GFX1250-NEXT: s_or_b32 s0, s0, s7 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1250-NEXT: s_lshl_b32 s10, s10, 16 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff +; GFX1250-NEXT: s_lshl_b32 s8, s8, 16 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s4, s4, 16 +; GFX1250-NEXT: s_lshl_b32 s5, s6, 16 +; GFX1250-NEXT: s_or_b32 s3, s3, s10 +; GFX1250-NEXT: s_or_b32 s2, s2, s8 +; GFX1250-NEXT: s_or_b32 s0, s0, s4 +; GFX1250-NEXT: s_or_b32 s1, s1, s5 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_endpgm entry: %add = add <16 x i8> %arg0, %arg0 store <16 x i8> %add, ptr addrspace(1) null @@ -2186,6 +2697,149 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amd_kernel_v32i8: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16 +; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s16, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s17, s0, 24 +; GFX1250-NEXT: s_lshr_b32 s20, s2, 16 +; GFX1250-NEXT: s_lshr_b32 s21, s2, 24 +; GFX1250-NEXT: s_lshr_b32 s14, s7, 16 +; GFX1250-NEXT: s_lshr_b32 s15, s7, 24 +; GFX1250-NEXT: s_bfe_u32 s27, s7, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s17, s17, s17 +; GFX1250-NEXT: s_add_co_i32 s16, s16, s16 +; GFX1250-NEXT: s_lshr_b32 s18, s1, 16 +; GFX1250-NEXT: s_lshr_b32 s19, s1, 24 +; GFX1250-NEXT: s_lshr_b32 s22, s3, 16 +; GFX1250-NEXT: s_lshr_b32 s23, s3, 24 +; GFX1250-NEXT: s_bfe_u32 s29, s1, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s21, s21, s21 +; GFX1250-NEXT: s_add_co_i32 s20, s20, s20 +; GFX1250-NEXT: s_lshl_b32 s17, s17, 8 +; GFX1250-NEXT: s_and_b32 s16, s16, 0xff +; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 +; GFX1250-NEXT: s_add_co_i32 s27, s27, s27 +; GFX1250-NEXT: s_add_co_i32 s15, s15, s15 +; GFX1250-NEXT: s_add_co_i32 s14, s14, s14 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s30, s30, s30 +; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 +; GFX1250-NEXT: s_add_co_i32 s22, s22, s22 +; GFX1250-NEXT: s_lshl_b32 s21, s21, 8 +; GFX1250-NEXT: s_and_b32 s20, s20, 0xff +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s29, s29, s29 +; GFX1250-NEXT: s_add_co_i32 s19, s19, s19 +; GFX1250-NEXT: s_add_co_i32 s18, s18, s18 +; GFX1250-NEXT: s_lshr_b32 s10, s5, 16 +; GFX1250-NEXT: s_lshr_b32 s11, s5, 24 +; GFX1250-NEXT: s_lshr_b32 s12, s6, 16 +; GFX1250-NEXT: s_lshr_b32 s13, s6, 24 +; GFX1250-NEXT: s_or_b32 s16, s16, s17 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xff +; GFX1250-NEXT: s_lshl_b32 s17, s27, 8 +; GFX1250-NEXT: s_lshl_b32 s15, s15, 8 +; GFX1250-NEXT: s_and_b32 s14, s14, 0xff +; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_lshl_b32 s30, s30, 8 +; GFX1250-NEXT: s_lshl_b32 s23, s23, 8 +; GFX1250-NEXT: s_and_b32 s22, s22, 0xff +; GFX1250-NEXT: s_or_b32 s20, s20, s21 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_lshl_b32 s21, s29, 8 +; GFX1250-NEXT: s_lshl_b32 s19, s19, 8 +; GFX1250-NEXT: s_and_b32 s18, s18, 0xff +; GFX1250-NEXT: s_lshr_b32 s8, s4, 16 +; GFX1250-NEXT: s_lshr_b32 s9, s4, 24 +; GFX1250-NEXT: s_bfe_u32 s24, s4, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s25, s5, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s26, s6, 0x80008 +; GFX1250-NEXT: s_or_b32 s7, s7, s17 +; GFX1250-NEXT: s_or_b32 s14, s14, s15 +; GFX1250-NEXT: s_add_co_i32 s13, s13, s13 +; GFX1250-NEXT: s_add_co_i32 s12, s12, s12 +; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 +; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 +; GFX1250-NEXT: s_bfe_u32 s28, s0, 0x80008 +; GFX1250-NEXT: s_or_b32 s3, s3, s30 +; GFX1250-NEXT: s_or_b32 s22, s22, s23 +; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x80008 +; GFX1250-NEXT: s_or_b32 s1, s1, s21 +; GFX1250-NEXT: s_or_b32 s18, s18, s19 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xffff +; GFX1250-NEXT: s_lshl_b32 s14, s14, 16 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 +; GFX1250-NEXT: s_add_co_i32 s26, s26, s26 +; GFX1250-NEXT: s_lshl_b32 s13, s13, 8 +; GFX1250-NEXT: s_and_b32 s12, s12, 0xff +; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 +; GFX1250-NEXT: s_add_co_i32 s25, s25, s25 +; GFX1250-NEXT: s_lshl_b32 s11, s11, 8 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xff +; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_add_co_i32 s24, s24, s24 +; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 +; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1250-NEXT: s_lshl_b32 s22, s22, 16 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 +; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff +; GFX1250-NEXT: s_lshl_b32 s18, s18, 16 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s28, s28, s28 +; GFX1250-NEXT: s_or_b32 s7, s7, s14 +; GFX1250-NEXT: s_and_b32 s6, s6, 0xff +; GFX1250-NEXT: s_lshl_b32 s14, s26, 8 +; GFX1250-NEXT: s_or_b32 s12, s12, s13 +; GFX1250-NEXT: s_and_b32 s5, s5, 0xff +; GFX1250-NEXT: s_lshl_b32 s13, s25, 8 +; GFX1250-NEXT: s_or_b32 s10, s10, s11 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_lshl_b32 s11, s24, 8 +; GFX1250-NEXT: s_lshl_b32 s9, s9, 8 +; GFX1250-NEXT: s_and_b32 s8, s8, 0xff +; GFX1250-NEXT: s_or_b32 s3, s3, s22 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_lshl_b32 s22, s23, 8 +; GFX1250-NEXT: s_or_b32 s1, s1, s18 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_lshl_b32 s18, s28, 8 +; GFX1250-NEXT: s_or_b32 s6, s6, s14 +; GFX1250-NEXT: s_or_b32 s5, s5, s13 +; GFX1250-NEXT: s_or_b32 s4, s4, s11 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_or_b32 s2, s2, s22 +; GFX1250-NEXT: s_or_b32 s0, s0, s18 +; GFX1250-NEXT: s_and_b32 s6, s6, 0xffff +; GFX1250-NEXT: s_lshl_b32 s12, s12, 16 +; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff +; GFX1250-NEXT: s_lshl_b32 s8, s8, 16 +; GFX1250-NEXT: s_lshl_b32 s9, s10, 16 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff +; GFX1250-NEXT: s_lshl_b32 s20, s20, 16 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s16, s16, 16 +; GFX1250-NEXT: s_or_b32 s6, s6, s12 +; GFX1250-NEXT: s_or_b32 s4, s4, s8 +; GFX1250-NEXT: s_or_b32 s5, s5, s9 +; GFX1250-NEXT: s_or_b32 s2, s2, s20 +; GFX1250-NEXT: s_or_b32 s0, s0, s16 +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX1250-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX1250-NEXT: s_endpgm entry: %add = add <32 x i8> %arg0, %arg0 store <32 x i8> %add, ptr addrspace(1) null @@ -2212,6 +2866,12 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_i1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store i1 %arg0, ptr addrspace(1) poison ret void } @@ -2330,6 +2990,56 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: amdgpu_cs_v8i1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l +; GFX1250-TRUE16-NEXT: v_and_b16 v3.l, v6.l, 1 +; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 1, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 2, v3.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v3.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v3.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v2.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec +; GFX1250-TRUE16-NEXT: flat_store_b8 v[0:1], v0 +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: amdgpu_cs_v8i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 4, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec +; GFX1250-FAKE16-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-FAKE16-NEXT: s_endpgm store <8 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -2545,6 +3255,94 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: amdgpu_cs_v16i1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_and_b16 v2.h, v6.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l +; GFX1250-TRUE16-NEXT: v_and_b16 v4.h, v10.l, 1 +; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1 +; GFX1250-TRUE16-NEXT: v_and_b16 v3.h, v8.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 2, v2.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.h, 3, v11.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v4.h, 2, v4.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v15.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.h, 1, v13.l +; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v2.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 1, v5.l +; GFX1250-TRUE16-NEXT: v_and_b16 v5.l, v14.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 1, v9.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 3, v3.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.h, v3.h, 3, v2.h bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v4.h +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v4.h, v12.l, v6.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v2.h, 15, v3.h bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.h, v4.l, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 12, v1.h +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec +; GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v0 +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: amdgpu_cs_v16i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v11, 3, v11 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v13, 1, v13 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 1, v10 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v10, 1, v14 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v14, 3, v15 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 2, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v10, 2, v10 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v5, v8, 3, v9 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v9, v12, v13, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v8, v14, v10 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v3, v5, 15, v7 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v9, v8, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 4, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 12, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec +; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-FAKE16-NEXT: s_endpgm store <16 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -2945,6 +3743,170 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: amdgpu_cs_v32i1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l +; GFX1250-TRUE16-NEXT: v_and_b16 v19.l, v22.l, 1 +; GFX1250-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v23.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.h, 1, v21.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 2, v19.l +; GFX1250-TRUE16-NEXT: v_and_b16 v3.h, v6.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.l, v16.l, v17.l, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v19.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.h, v20.l, v19.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_and_b16 v17.l, v26.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 2, v3.h +; GFX1250-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v18.l +; GFX1250-TRUE16-NEXT: v_and_b16 v18.h, v24.l, 1 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.h, v17.h, v16.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.h, 1, v25.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 3, v27.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 2, v17.l +; GFX1250-TRUE16-NEXT: v_and_b16 v19.h, v30.l, 1 +; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v3.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 1, v5.l +; GFX1250-TRUE16-NEXT: v_and_b16 v5.l, v10.l, 1 +; GFX1250-TRUE16-NEXT: v_and_b16 v5.h, v14.l, 1 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.h, v18.h, 3, v17.h bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v17.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v31.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 2, v19.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.h, 1, v29.l +; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 1, v9.l +; GFX1250-TRUE16-NEXT: v_and_b16 v4.h, v8.l, 1 +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v11.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.h, 3, v15.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.h, 2, v5.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v7.l, 1, v13.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.l, v17.h, 15, v17.l bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.l, v16.l, v18.l, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v17.h, v18.h, v19.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v18.l, v28.l, v19.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 3, v3.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v3.h, v4.h, 3, v3.h bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v5.h +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v5.l, v12.l, v7.l, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.h, v18.l, v17.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v3.h, 15, v4.l bitop3:0xc8 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v5.l, v4.h, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 12, v2.h +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 12, v1.h +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec +; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v10 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v9 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 1, v8 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v8, 3, v11 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v3, v4, v3, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v7, 3, v5 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 1, v14 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v23 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v5, v8, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 3, v15 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 2, v1 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 1, v13 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, 15, v5 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 1, v22 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v9, 1, v26 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v11, 1, v30 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v12, v2, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 2, v5 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v18 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v10, 1, v25 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v12, 1, v24 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v13, 3, v27 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 1, v21 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v9, 2, v9 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v14, 3, v31 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v11, 2, v11 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v15, 1, v29 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v2, v1, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 3, v19 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v8, 1, v17 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v7, v20, v7, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v10, v12, 3, v10 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v12, v28, v15, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v6, v16, v8, 1 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v5, v7, v5, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v7, v10, 15, v9 bitop3:0xc8 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v8, v12, v11, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 4, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v4, 8, v4 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v6, v2, 3 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 4, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 12, v8 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v3, 15 bitop3:0xec +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v2, v5, 15 bitop3:0xec +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v2, v3, 0xff bitop3:0xec +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX1250-FAKE16-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-FAKE16-NEXT: s_endpgm store <32 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -2973,6 +3935,14 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_inreg_i1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s0, s0, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store i1 %arg0, ptr addrspace(1) poison ret void } @@ -3061,6 +4031,34 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_inreg_v8i1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s6, s6, 1 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 1 +; GFX1250-NEXT: s_and_b32 s4, s4, 1 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_lshl_b32 s1, s1, 1 +; GFX1250-NEXT: s_and_b32 s0, s0, 1 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 3 +; GFX1250-NEXT: s_lshl_b32 s6, s6, 2 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 3 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 2 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s5, s7, s6 +; GFX1250-NEXT: s_and_b32 s4, s4, 3 +; GFX1250-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 3 +; GFX1250-NEXT: s_or_b32 s2, s4, s5 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_lshl_b32 s1, s2, 4 +; GFX1250-NEXT: s_and_b32 s0, s0, 15 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store <8 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -3221,6 +4219,58 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_inreg_v16i1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s10, s10, 1 +; GFX1250-NEXT: s_lshl_b32 s9, s9, 1 +; GFX1250-NEXT: s_and_b32 s8, s8, 1 +; GFX1250-NEXT: s_and_b32 s6, s6, 1 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 1 +; GFX1250-NEXT: s_and_b32 s4, s4, 1 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_lshl_b32 s1, s1, 1 +; GFX1250-NEXT: s_and_b32 s0, s0, 1 +; GFX1250-NEXT: s_and_b32 s14, s14, 1 +; GFX1250-NEXT: s_lshl_b32 s13, s13, 1 +; GFX1250-NEXT: s_and_b32 s12, s12, 1 +; GFX1250-NEXT: s_lshl_b32 s11, s11, 3 +; GFX1250-NEXT: s_lshl_b32 s10, s10, 2 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 3 +; GFX1250-NEXT: s_lshl_b32 s6, s6, 2 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 3 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 2 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_lshl_b32 s15, s15, 3 +; GFX1250-NEXT: s_lshl_b32 s14, s14, 2 +; GFX1250-NEXT: s_or_b32 s12, s12, s13 +; GFX1250-NEXT: s_or_b32 s9, s11, s10 +; GFX1250-NEXT: s_and_b32 s8, s8, 3 +; GFX1250-NEXT: s_or_b32 s5, s7, s6 +; GFX1250-NEXT: s_and_b32 s4, s4, 3 +; GFX1250-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 3 +; GFX1250-NEXT: s_or_b32 s13, s15, s14 +; GFX1250-NEXT: s_and_b32 s12, s12, 3 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_or_b32 s2, s4, s5 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s10, s12, s13 +; GFX1250-NEXT: s_and_b32 s8, s8, 15 +; GFX1250-NEXT: s_lshl_b32 s1, s2, 4 +; GFX1250-NEXT: s_and_b32 s0, s0, 15 +; GFX1250-NEXT: s_lshl_b32 s9, s10, 12 +; GFX1250-NEXT: s_lshl_b32 s2, s8, 8 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s1, s9, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store <16 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -3525,6 +4575,106 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_inreg_v32i1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s10, s10, 1 +; GFX1250-NEXT: s_lshl_b32 s9, s9, 1 +; GFX1250-NEXT: s_and_b32 s8, s8, 1 +; GFX1250-NEXT: s_and_b32 s14, s14, 1 +; GFX1250-NEXT: s_lshl_b32 s13, s13, 1 +; GFX1250-NEXT: s_and_b32 s12, s12, 1 +; GFX1250-NEXT: s_lshl_b32 s11, s11, 3 +; GFX1250-NEXT: s_lshl_b32 s10, s10, 2 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_and_b32 s6, s6, 1 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 1 +; GFX1250-NEXT: s_and_b32 s4, s4, 1 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_lshl_b32 s1, s1, 1 +; GFX1250-NEXT: s_and_b32 s0, s0, 1 +; GFX1250-NEXT: s_lshl_b32 s15, s15, 3 +; GFX1250-NEXT: s_lshl_b32 s14, s14, 2 +; GFX1250-NEXT: s_or_b32 s12, s12, s13 +; GFX1250-NEXT: s_or_b32 s9, s11, s10 +; GFX1250-NEXT: s_and_b32 s8, s8, 3 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 3 +; GFX1250-NEXT: s_lshl_b32 s6, s6, 2 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 3 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 2 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s13, s15, s14 +; GFX1250-NEXT: s_and_b32 s12, s12, 3 +; GFX1250-NEXT: s_or_b32 s8, s8, s9 +; GFX1250-NEXT: s_or_b32 s5, s7, s6 +; GFX1250-NEXT: s_and_b32 s4, s4, 3 +; GFX1250-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 3 +; GFX1250-NEXT: s_or_b32 s10, s12, s13 +; GFX1250-NEXT: s_and_b32 s8, s8, 15 +; GFX1250-NEXT: s_or_b32 s2, s4, s5 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_lshl_b32 s9, s10, 12 +; GFX1250-NEXT: s_lshl_b32 s1, s2, 4 +; GFX1250-NEXT: s_and_b32 s0, s0, 15 +; GFX1250-NEXT: s_lshl_b32 s2, s8, 8 +; GFX1250-NEXT: s_and_b32 s3, s30, 1 +; GFX1250-NEXT: s_lshl_b32 s4, s29, 1 +; GFX1250-NEXT: s_and_b32 s5, s28, 1 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s1, s9, s2 +; GFX1250-NEXT: s_lshl_b32 s2, s31, 3 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 2 +; GFX1250-NEXT: s_or_b32 s4, s5, s4 +; GFX1250-NEXT: s_and_b32 s5, s26, 1 +; GFX1250-NEXT: s_lshl_b32 s6, s25, 1 +; GFX1250-NEXT: s_and_b32 s7, s24, 1 +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_and_b32 s3, s4, 3 +; GFX1250-NEXT: s_lshl_b32 s4, s27, 3 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 2 +; GFX1250-NEXT: s_or_b32 s6, s7, s6 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s5, s6, 3 +; GFX1250-NEXT: s_or_b32 s2, s3, s2 +; GFX1250-NEXT: s_or_b32 s3, s5, s4 +; GFX1250-NEXT: s_and_b32 s5, s22, 1 +; GFX1250-NEXT: s_lshl_b32 s6, s21, 1 +; GFX1250-NEXT: s_and_b32 s7, s20, 1 +; GFX1250-NEXT: s_lshl_b32 s4, s23, 3 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 2 +; GFX1250-NEXT: s_or_b32 s6, s7, s6 +; GFX1250-NEXT: s_and_b32 s7, s18, 1 +; GFX1250-NEXT: s_lshl_b32 s8, s17, 1 +; GFX1250-NEXT: s_and_b32 s9, s16, 1 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s5, s6, 3 +; GFX1250-NEXT: s_lshl_b32 s6, s19, 3 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 2 +; GFX1250-NEXT: s_or_b32 s8, s9, s8 +; GFX1250-NEXT: s_or_b32 s6, s6, s7 +; GFX1250-NEXT: s_and_b32 s7, s8, 3 +; GFX1250-NEXT: s_or_b32 s4, s5, s4 +; GFX1250-NEXT: s_or_b32 s5, s7, s6 +; GFX1250-NEXT: s_and_b32 s3, s3, 15 +; GFX1250-NEXT: s_lshl_b32 s4, s4, 4 +; GFX1250-NEXT: s_and_b32 s5, s5, 15 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 12 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 8 +; GFX1250-NEXT: s_or_b32 s4, s5, s4 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_and_b32 s3, s4, 0xff +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store <32 x i1> %arg0, ptr addrspace(1) poison ret void } @@ -3549,6 +4699,12 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_i1_sext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store i1 %arg0, ptr addrspace(1) poison ret void } @@ -3570,6 +4726,11 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: amdgpu_cs_i1_zext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_store_b8 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm store i1 %arg0, ptr addrspace(1) poison ret void } |
