summary refs log tree commit diff
path: root/llvm/lib/Target/AMDGPU/SIInstructions.td
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td 221
1 files changed, 186 insertions, 35 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b450122673..1f7951258c21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -66,7 +66,7 @@ defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
// Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
let OtherPredicates = [isNotGFX90APlus] in {
-let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
+let Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
@@ -77,7 +77,7 @@ defm V_INTERP_P2_F32 : VINTRP_m <
[(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
+} // End Constraints = "$src0 = $vdst"
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
@@ -326,28 +326,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
// clang-format off
-defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+
multiclass
- AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
+ AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType
- : VPseudoInstSI<(outs SGPR_32 : $sdst),
- (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
- [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
+ : VPseudoInstSI<(outs RetReg : $sdst),
+ (ins Reg : $src, VSrc_b32 : $strategy),
+ [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
}
}
// clang-format on
+class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
+ RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
+ string Name = OpName;
+ string TypeString = TypeStr;
+ ValueType VT = Ty;
+ RegisterClass RetReg = ReturnRegisterClass;
+ SrcRegOrImm9 Reg = RC;
+}
+
// Input list : [Operation_name,
-// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
+// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+// value type (i32/i64, i.e. the bit-width),
+// output register class,
+// input register class]
defvar Operations = [
- ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
- ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
- ["xor", "B32"]
+ WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
+
+ WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
- defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
+ defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
+ Op.VT, Op.RetReg, Op.Reg>;
}
let usesCustomInserter = 1, Defs = [VCC] in {
@@ -692,6 +721,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
def : GCNPat<
(AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN.
+// This is used for tail calls *from* a whole wave function. Tail calls to
+// a whole wave function may use the usual opcodes, depending on the calling
+// convention of the caller.
+def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI <
+ (outs),
+ (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+ let isCall = 1;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+ let isConvergent = 1;
+
+ // We're going to use custom handling to set the $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff),
+ (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0,
+ tglobaladdr:$callee, i32:$fpdiff)>;
+
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -2174,7 +2230,8 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2205,6 +2262,42 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+ (fcopysign f32:$src0, fp16vt:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+ (fcopysign f64:$src0, fp16vt:$src1),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f32:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
} // End foreach fp16vt = [f16, bf16]
@@ -2480,6 +2573,38 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
+// (y | ~x)
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1))
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1)
+>;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPatIgnoreCopies <
@@ -3096,6 +3221,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3106,15 +3236,18 @@ def : GCNPat<
def : GCNPat<
(i64 (DivergentUnaryFrag<zext> i16:$src)),
- (REG_SEQUENCE VReg_64,
- (INSERT_SUBREG (i32 (V_MOV_B32_e32 (i32 0))), VGPR_16:$src, lo16), sub0,
- (S_MOV_B32 (i32 0)), sub1)
+ (REG_SEQUENCE VReg_64, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16, (V_MOV_B32_e32 (i32 0)), sub1)
>;
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
}
def : GCNPat <
@@ -3143,11 +3276,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3637,13 +3765,24 @@ def : GCNPat <
>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in
+let True16Predicate = p in {
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
+// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
+ (Ty !if(!eq(Ty, i16),
+ (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
+ (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3669,18 +3808,6 @@ def : GCNPat <
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
>;
-
-// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
-// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
-def : GCNPat <
- (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
- (Ty !if(!eq(Ty, i16),
- (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
- (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
- (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
->;
-
-
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
let True16Predicate = NotHasTrue16BitInsts in {
@@ -3752,7 +3879,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3772,6 +3900,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat < // 64-bit result: lo16, hi16 and sub1 must be sequenced in VReg_64
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat < // 64-bit result: lo16, hi16 and sub1 must be sequenced in VReg_64
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,