diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 2223 |
1 files changed, 2223 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll new file mode 100644 index 000000000000..f54fbbaabe9f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -0,0 +1,2223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*load_* instructions. + +; -------------------------------------------------------------------------------- +; No vgpr offset, constants +; -------------------------------------------------------------------------------- + +; SGPR base only +define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %load = load i8, ptr %sbase + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset + 1 +define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset -1 +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff7fffff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388609 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967296 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967297 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971391 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971392 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967295 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, -1 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967296 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000001: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000001: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967297 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Basic addressing patterns +; -------------------------------------------------------------------------------- + +; Basic pattern, no immediate offset. +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx1250 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx1250 + 1 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388608 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx1250 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388608 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx1250 - 1 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388607 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607 + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 %zext.offset + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; pointer addressing done in integers +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %sbase.as.int, %zext.offset + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset in non-canonical position +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add.immoffset = add i64 %sbase.as.int, 128 + %add = add i64 %zext.offset, %add.immoffset + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1] offset:42 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:42 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar +define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar, with immediate offset. +define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression +define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset. +define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffset) { +; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset, with imm offset +define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i32 inreg %soffset) { +; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Natural addressing shifts with restricted range +; -------------------------------------------------------------------------------- + +; Cannot push the shift into 32-bits, and cannot match. +define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep + ret float %load +} + +; Cannot push the shift into 32-bits, with an immediate offset. +define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing_immoffset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 128 + %load = load float, ptr %gep1 + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits. +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_imm_offset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds float, ptr %gep0, i64 100 + %load = load float, ptr %gep1 + ret float %load +} + +; Range is 1 beyond the limit where we can move the shift into 32-bits. +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep + ret float %load +} + +; -------------------------------------------------------------------------------- +; Stress various type loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load half, ptr %gep0 + ret half %load +} + +define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load half, ptr %gep1 + ret half %load +} + +define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i32, ptr %gep0 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i32, ptr %gep1 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep0 + ret float %load +} + +define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load float, ptr %gep1 + ret float %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i16>, ptr %gep0 + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i16>, ptr %gep1 + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x half>, ptr %gep0 + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x half>, ptr %gep1 + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load ptr addrspace(3), ptr %gep0 + %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p3_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load ptr addrspace(3), ptr %gep1 + %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load double, ptr %gep0 + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load double, ptr %gep1 + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i64, ptr %gep0 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i64, ptr %gep1 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x float>, ptr %gep0 + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x float>, ptr %gep1 + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i32>, ptr %gep0 + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i32>, ptr %gep1 + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x i16>, ptr %gep0 + %cast.load = bitcast <4 x i16> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x i16>, ptr %gep1 + %cast.load = bitcast <4 x i16> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x half>, ptr %gep0 + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x half>, ptr %gep1 + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load ptr, ptr %gep0 + %cast.load0 = ptrtoint ptr %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p1_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load ptr, ptr %gep1 + %cast.load0 = ptrtoint ptr %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <3 x float>, ptr %gep0 + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <3 x float>, ptr %gep1 + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <3 x i32>, ptr %gep0 + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <3 x i32>, ptr %gep1 + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v6f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <6 x half>, ptr %gep0 + ret <6 x half> %load +} + +define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v6f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <6 x half>, ptr %gep1 + ret <6 x half> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x float>, ptr %gep0 + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x float>, ptr %gep1 + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x i32>, ptr %gep0 + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x i32>, ptr %gep1 + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i64>, ptr %gep0 + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i64>, ptr %gep1 + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i128, ptr %gep0 + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i128_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i128, ptr %gep1 + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2p1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x ptr>, ptr %gep0 + %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2p1_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x ptr>, ptr %gep1 + %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4p3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x ptr addrspace(3)>, ptr %gep0 + %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4p3_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x ptr addrspace(3)>, ptr %gep1 + %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +; -------------------------------------------------------------------------------- +; Extending loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i8_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i8_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %zextload = zext i16 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %zextload = zext i16 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +; -------------------------------------------------------------------------------- +; Atomic load +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load atomic i32, ptr %gep0 seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load atomic i32, ptr %gep1 seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load atomic i64, ptr %gep0 seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load atomic i64, ptr %gep1 seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +; -------------------------------------------------------------------------------- +; D16 load (low 16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; D16 hi load (hi16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; or-with-constant as add +; -------------------------------------------------------------------------------- + +; Check add-as-or with split 64-bit or. +define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_bitop2_b32 v0, 16, v0 bitop3:0x54 +; GFX1250-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 16 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 4160 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Full 64-bit scalar add. +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +; Make sure we only have a single zero vaddr initialization. + +define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inreg %arg.1) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i5.1 = getelementptr inbounds float, ptr %arg.1, i64 %i4 + %i6.1 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +!0 = !{i32 0, i32 1073741824} ; (1 << 30) +!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 |
