diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstructions.td')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 221 |
1 files changed, 186 insertions, 35 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e8b450122673..1f7951258c21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -66,7 +66,7 @@ defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 let OtherPredicates = [isNotGFX90APlus] in { -let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { +let Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, @@ -77,7 +77,7 @@ defm V_INTERP_P2_F32 : VINTRP_m < [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; -} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" +} // End Constraints = "$src0 = $vdst" defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, @@ -326,28 +326,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; // clang-format off -defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_"; + multiclass - AMDGPUWaveReducePseudoGenerator<string Op, string DataType> { + AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> { let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { def !toupper(Op) #"_PSEUDO_" #DataType - : VPseudoInstSI<(outs SGPR_32 : $sdst), - (ins VSrc_b32 : $src, VSrc_b32 : $strategy), - [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {} + : VPseudoInstSI<(outs RetReg : $sdst), + (ins Reg : $src, VSrc_b32 : $strategy), + [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {} } } // clang-format on +class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, + RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> { + string Name = OpName; + string TypeString = TypeStr; + ValueType VT = Ty; + RegisterClass RetReg = ReturnRegisterClass; + SrcRegOrImm9 Reg = RC; +} + // Input list : [Operation_name, -// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)] +// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), +// bit-width +// output register class, +// input register class] defvar Operations = [ - ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"], - ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"], - ["xor", "B32"] + WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>, + + WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>, ]; foreach Op = Operations in { - defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>; + defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString, + Op.VT, Op.RetReg, Op.Reg>; } let usesCustomInserter = 1, Defs = [VCC] in { @@ -692,6 +721,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI < def : GCNPat< (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>; +// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN. +// This is used for tail calls *from* a whole wave function. Tail calls to +// a whole wave function may use the usual opcodes, depending on the calling +// convention of the caller. +def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI < + (outs), + (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> { + let isCall = 1; + let isTerminator = 1; + let isReturn = 1; + let isBarrier = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; + let isConvergent = 1; + + // We're going to use custom handling to set the $orig_exec to the correct value. + let usesCustomInserter = 1; +} + +// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its +// argument. It will be filled in by the custom inserter. +def : GCNPat< + (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff), + (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0, + tglobaladdr:$callee, i32:$fpdiff)>; + + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -2174,7 +2230,8 @@ def : GCNPat < } foreach fp16vt = [f16, bf16] in { - +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (fcopysign fp16vt:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -2205,6 +2262,42 @@ def : GCNPat < (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; +} +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat < + (fcopysign fp16vt:$src0, fp16vt:$src1), + (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16), + (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16) +>; + +def : GCNPat < + (fcopysign f32:$src0, fp16vt:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)) +>; + +def : GCNPat < + (fcopysign f64:$src0, fp16vt:$src1), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1) +>; + +def : GCNPat < + (fcopysign fp16vt:$src0, f32:$src1), + (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16) +>; + +def : GCNPat < + (fcopysign fp16vt:$src0, f64:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16), + (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) +>; +} } // End foreach fp16vt = [f16, bf16] @@ -2480,6 +2573,38 @@ def : AMDGPUPatIgnoreCopies < (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; +// (z & ~x) +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)), + (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z) +>; + +// 64-bit version +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)), + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) +>; + +// (y | ~x) +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)), + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1)) +>; + +// 64-bit version +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)), + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1) +>; + // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPatIgnoreCopies < @@ -3096,6 +3221,11 @@ def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src) >; + +def : GCNPat < + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) +>; } let True16Predicate = UseRealTrue16Insts in { @@ -3106,15 +3236,18 @@ def : GCNPat< def : GCNPat< (i64 (DivergentUnaryFrag<zext> i16:$src)), - (REG_SEQUENCE VReg_64, - (INSERT_SUBREG (i32 (V_MOV_B32_e32 (i32 0))), VGPR_16:$src, lo16), sub0, - (S_MOV_B32 (i32 0)), sub1) + (REG_SEQUENCE VReg_64, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16, (V_MOV_B32_e32 (i32 0)), sub1) >; def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) >; + +def : GCNPat < + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0)) +>; } def : GCNPat < @@ -3143,11 +3276,6 @@ def : GCNPat < (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; -def : GCNPat < - (i1 (DivergentUnaryFrag<trunc> i16:$a)), - (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) ->; - def IMMBitSelConst : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), MVT::i32); @@ -3637,13 +3765,24 @@ def : GCNPat < >; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = p in { // Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) >; +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) +def : GCNPat < + (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$b, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) +>; +} + let True16Predicate = UseRealTrue16Insts in { def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))), @@ -3669,18 +3808,6 @@ def : GCNPat < (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) >; - -// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] -// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) -def : GCNPat < - (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), - (Ty !if(!eq(Ty, i16), - (Ty (trunc (srl VGPR_32:$b, (i32 16)))), - (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), - (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) ->; - - // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) let True16Predicate = NotHasTrue16BitInsts in { @@ -3752,7 +3879,8 @@ def : GCNPat < (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; - +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) @@ -3772,6 +3900,29 @@ def : GCNPat < (v4f16 (scalar_to_vector f16:$src0)), (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) >; +} + +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat < + (v2f16 (scalar_to_vector f16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16) +>; + +def : GCNPat < + (v2i16 (scalar_to_vector i16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16) +>; + +def : GCNPat < + (v4i16 (scalar_to_vector i16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1) +>; + +def : GCNPat < + (v4f16 (scalar_to_vector f16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1) +>; +} def : GCNPat < (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, |
