diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
26 files changed, 448 insertions, 290 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index e844904ebd1e..0f9798838d7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1523,7 +1523,8 @@ Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; - int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); + unsigned BitWidth = Num->getType()->getScalarSizeInBits(); + int NumDivBits = getDivNumBits(I, Num, Den, BitWidth - 32, IsSigned); if (NumDivBits == -1) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 985fa8f1deff..da47aaf8a3b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -124,6 +124,16 @@ def sign_extension_in_reg : GICombineRule< [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]), (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>; +// Do the following combines : +// fmul x, select(y, A, B) -> fldexp (x, select i32 (y, a, b)) +// fmul x, select(y, -A, -B) -> fldexp ((fneg x), select i32 (y, a, b)) +def combine_fmul_with_select_to_fldexp : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_FMUL $dst, $x, $select):$root, + (G_SELECT $select, $y, $A, $B):$sel, + [{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This @@ -153,13 +163,13 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>; def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", - [all_combines, clamp_i64_to_i16, foldable_fneg]> { + [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg]> { let CombineAllMethodName = "tryCombineAllImpl"; } def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", - [all_combines, gfx6gfx7_combines, gfx8_combines, + [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index e5a376ab7357..f6f9f4bc0fb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -17,6 +17,13 @@ using namespace llvm; using namespace MIPatternMatch; +AMDGPUCombinerHelper::AMDGPUCombinerHelper( + GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize, + GISelKnownBits *KB, MachineDominatorTree *MDT, const LegalizerInfo *LI, + const GCNSubtarget &STI) + : CombinerHelper(Observer, B, IsPreLegalize, KB, MDT, LI), STI(STI), + TII(*STI.getInstrInfo()) {} + LLVM_READNONE static bool fnegFoldsIntoMI(const MachineInstr &MI) { switch (MI.getOpcode()) { @@ -445,3 +452,67 @@ void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI, Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1); MI.eraseFromParent(); } + +bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp( + MachineInstr &MI, MachineInstr &Sel, + std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FMUL); + assert(Sel.getOpcode() == TargetOpcode::G_SELECT); + assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg()); + + Register Dst = MI.getOperand(0).getReg(); + LLT DestTy = MRI.getType(Dst); + LLT ScalarDestTy = DestTy.getScalarType(); + + if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() && + ScalarDestTy != LLT::float16()) || + !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg())) + return false; + + Register SelectCondReg = Sel.getOperand(1).getReg(); + MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg()); + MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg()); + + const auto SelectTrueVal = + isConstantOrConstantSplatVectorFP(*SelectTrue, MRI); + if (!SelectTrueVal) + return false; + const auto SelectFalseVal = + isConstantOrConstantSplatVectorFP(*SelectFalse, MRI); + if (!SelectFalseVal) + return false; + + if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative()) + return false; + + // For f32, only non-inline constants should be transformed. + if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) && + TII.isInlineConstant(*SelectFalseVal)) + return false; + + int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs(); + if (SelectTrueLog2Val == INT_MIN) + return false; + int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs(); + if (SelectFalseLog2Val == INT_MIN) + return false; + + MatchInfo = [=, &MI](MachineIRBuilder &Builder) { + LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32)); + auto NewSel = Builder.buildSelect( + IntDestTy, SelectCondReg, + Builder.buildConstant(IntDestTy, SelectTrueLog2Val), + Builder.buildConstant(IntDestTy, SelectFalseLog2Val)); + + Register XReg = MI.getOperand(1).getReg(); + if (SelectTrueVal->isNegative()) { + auto NegX = + Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags()); + Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags()); + } else { + Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags()); + } + }; + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h index 6510abe9d232..893b3f5415f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h @@ -15,13 +15,22 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H +#include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" namespace llvm { class AMDGPUCombinerHelper : public CombinerHelper { +protected: + const GCNSubtarget &STI; + const SIInstrInfo &TII; + public: using CombinerHelper::CombinerHelper; + AMDGPUCombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, + bool IsPreLegalize, GISelKnownBits *KB, + MachineDominatorTree *MDT, const LegalizerInfo *LI, + const GCNSubtarget &STI); bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); @@ -30,6 +39,10 @@ public: Register Src1, Register Src2); void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, Register Src1, Register Src2); + + bool matchCombineFmulWithSelectToFldexp( + MachineInstr &MI, MachineInstr &Sel, + std::function<void(MachineIRBuilder &)> &MatchInfo); }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d9eaf82c5214..27e9018d68a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1997,7 +1997,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3be865f03df1..041b9b4d66f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1125,8 +1125,9 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, unsigned FakeS16Opc, unsigned S32Opc, unsigned S64Opc) { if (Size == 16) + // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code return ST.hasTrue16BitInsts() - ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc + ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc : S16Opc; if (Size == 32) return S32Opc; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 54d927c33fc5..888817e52e35 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -134,7 +134,7 @@ AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), TII(*STI.getInstrInfo()), - Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI), + Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI, STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenPostLegalizeGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index ff8189ce31f7..e1564d5de415 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -94,7 +94,7 @@ AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), - Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI), + Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI, STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index d94c400ad142..08e23cbf34e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1190,9 +1190,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); - // TODO: Need to emit a wave reduction to get the maximum size. - if (SizeBank != &AMDGPU::SGPRRegBank) - return false; + if (SizeBank != &AMDGPU::SGPRRegBank) { + auto WaveReduction = + B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)}) + .addUse(AllocSize) + .addImm(0); + AllocSize = WaveReduction.getReg(0); + } LLT PtrTy = MRI.getType(Dst); LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ed956a1f755c..d8f441d1ccfe 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9760,10 +9760,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, case MCK_SReg_64: case MCK_SReg_64_XEXEC: // Null is defined as a 32-bit register but - // it should also be enabled with 64-bit operands. - // The following code enables it for SReg_64 operands + // it should also be enabled with 64-bit operands or larger. + // The following code enables it for SReg_64 and larger operands // used as source and destination. Remaining source // operands are handled in isInlinableImm. + case MCK_SReg_96: + case MCK_SReg_128: + case MCK_SReg_256: + case MCK_SReg_512: return Operand.isNull() ? Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a351f451584f..f2686bdf56b4 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -168,7 +168,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), @@ -418,7 +418,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); @@ -680,7 +680,7 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), - (ins SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz), + (ins SReg_128_XNULL:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz), " $srsrc, $soffset$offset lds$cpol"> { let LGKM_CNT = 1; let mayLoad = 1; @@ -703,7 +703,7 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestric dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag MainInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset)); + dag MainInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset)); dag CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol), (ins CPol_NonGLC_WithDefault:$cpol)); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 590835180572..d2363274965a 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -279,7 +279,9 @@ DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64) DECODE_OPERAND_REG_7(SReg_64_XEXEC_XNULL, OPW64) DECODE_OPERAND_REG_7(SReg_96, OPW96) DECODE_OPERAND_REG_7(SReg_128, OPW128) +DECODE_OPERAND_REG_7(SReg_128_XNULL, OPW128) DECODE_OPERAND_REG_7(SReg_256, OPW256) +DECODE_OPERAND_REG_7(SReg_256_XNULL, OPW256) DECODE_OPERAND_REG_7(SReg_512, OPW512) DECODE_OPERAND_REG_8(AGPR_32) @@ -1692,6 +1694,11 @@ AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val, case OPW64: case OPWV232: return decodeSpecialReg64(Val); + case OPW96: + case OPW128: + case OPW256: + case OPW512: + return decodeSpecialReg96Plus(Val); default: llvm_unreachable("unexpected immediate type"); } @@ -1778,6 +1785,24 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSpecialReg96Plus(unsigned Val) const { + using namespace AMDGPU; + + switch (Val) { + case 124: + if (isGFX11Plus()) + return createRegOperand(SGPR_NULL); + break; + case 125: + if (!isGFX11Plus()) + return createRegOperand(SGPR_NULL); + break; + default: + break; + } + return errOperand(Val, "unknown operand encoding " + Twine(Val)); +} + MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, const unsigned Val, unsigned ImmWidth, diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index b19e4b74a394..9a06cc3dc8c7 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -259,6 +259,7 @@ public: MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + MCOperand decodeSpecialReg96Plus(unsigned Val) const; MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val, unsigned ImmWidth, diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 4722d3362f6a..1b94d6c43392 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -422,7 +422,7 @@ class MIMG_NoSampler_Helper <mimgopc op, string asm, RegisterClass addr_rc, string dns=""> : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -435,7 +435,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, RegisterClass addr_rc, string dns=""> : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -447,7 +447,7 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -460,7 +460,7 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode, string dns=""> : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -472,7 +472,7 @@ class MIMG_NoSampler_gfx11<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -485,7 +485,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode, string dns=""> : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -498,7 +498,7 @@ class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode, string dns=""> : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -510,8 +510,8 @@ class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC, string dns=""> : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -527,8 +527,8 @@ class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode, string dns=""> : VSAMPLE_gfx12<op.GFX12, (outs), num_addrs, dns, Addr3RC> { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -679,7 +679,7 @@ class MIMG_Store_Helper <mimgopc op, string asm, RegisterClass addr_rc, string dns = ""> : MIMG_gfx6789<op.GFX10M, (outs), dns> { - let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -693,7 +693,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm, string dns = ""> : MIMG_gfx90a<op.GFX10M, (outs), dns> { let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -705,7 +705,7 @@ class MIMG_Store_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx10<op.GFX10M, (outs), dns> { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -719,7 +719,7 @@ class MIMG_Store_nsa_gfx10<mimgopc op, string opcode, : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -731,7 +731,7 @@ class MIMG_Store_gfx11<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx11<op.GFX11, (outs), dns> { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -745,7 +745,7 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode, : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -759,7 +759,7 @@ class VIMAGE_Store_gfx12<mimgopc op, string opcode, : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -875,7 +875,7 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; @@ -887,7 +887,7 @@ class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc, let Constraints = "$vdst = $vdata"; let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; @@ -921,7 +921,7 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode, !if(enableDisasm, "GFX10", "")> { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -936,7 +936,7 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe)); let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -949,7 +949,7 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode, !if(enableDisasm, "GFX11", "")> { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -964,7 +964,7 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe)); let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -978,7 +978,7 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC, let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe)); let AsmString = !if(!empty(renamed), opcode, renamed)#" $vdata, "#AddrAsm# ", $rsrc$dmask$dim$cpol$r128$a16$tfe"; @@ -1128,7 +1128,7 @@ multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> : MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> { - let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1139,7 +1139,7 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { - let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1149,7 +1149,7 @@ class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, class MIMG_Sampler_OpList_gfx10p<dag OpPrefix, bit HasD16> { dag ret = !con(OpPrefix, - (ins SReg_256:$srsrc, SReg_128:$ssamp, + (ins SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(HasD16, (ins D16:$d16), (ins))); @@ -1524,7 +1524,7 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> { class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "GFX10"> { - let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; @@ -1532,13 +1532,13 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs> : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "GFX10"> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "GFX11"> { - let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; @@ -1548,7 +1548,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, list<RegisterClass> addr_types> : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11", addr_types> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } @@ -1556,7 +1556,7 @@ class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs, list<RegisterClass> addr_types> : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata), num_addrs, "GFX12", addr_types> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16"; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 4fb5cb066be7..2bc19137b1ca 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -199,7 +199,7 @@ static unsigned macToMad(unsigned Opc) { case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_F16_fake16_e64: - return AMDGPU::V_FMA_F16_gfx9_e64; + return AMDGPU::V_FMA_F16_gfx9_fake16_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: return AMDGPU::V_FMA_LEGACY_F32_e64; case AMDGPU::V_FMAC_F64_e64: @@ -1096,21 +1096,8 @@ void SIFoldOperandsImpl::foldOperand( B.addImm(Defs[I].second); } LLVM_DEBUG(dbgs() << "Folded " << *UseMI); - return; } - if (Size != 4) - return; - - Register Reg0 = UseMI->getOperand(0).getReg(); - Register Reg1 = UseMI->getOperand(1).getReg(); - if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); - else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64)); - else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) && - TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32)); return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 58b061f5c1af..0ac84f4e1f02 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC, -// except for stack growth direction(default: downwards, AMDGPU: upwards) and -// applying the wave size scale to the increment amount. -SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, - SelectionDAG &DAG) const { +// except for: +// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and +// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size +SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { const MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); SDLoc dl(Op); EVT VT = Op.getValueType(); - SDValue Tmp1 = Op; - SDValue Tmp2 = Op.getValue(1); - SDValue Tmp3 = Op.getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - + SDValue Chain = Op.getOperand(0); Register SPReg = Info->getStackPtrOffsetReg(); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); - SDValue Size = Tmp2.getOperand(1); + SDValue Size = Op.getOperand(1); SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT); - Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue(); + Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue(); const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp && @@ -4057,30 +4054,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, DAG.getSignedConstant(-ScaledAlignment, dl, VT)); } - SDValue ScaledSize = DAG.getNode( - ISD::SHL, dl, VT, Size, - DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); - - SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value + assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit"); + SDValue NewSP; + if (isa<ConstantSDNode>(Size)) { + // For constant sized alloca, scale alloca size by wave-size + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); + NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value + } else { + // For dynamic sized alloca, perform wave-wide reduction to get max of + // alloca size(divergent) and then scale it by wave-size + SDValue WaveReduction = + DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32); + Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction, + Size, DAG.getConstant(0, dl, MVT::i32)); + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); + NewSP = + DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr. + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32); + NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID, + NewSP); + } Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain - Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); + SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); - return DAG.getMergeValues({BaseAddr, Tmp2}, dl); -} - -SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - // We only handle constant sizes here to allow non-entry block, static sized - // allocas. A truly dynamic value is more difficult to support because we - // don't know if the size value is uniform or not. If the size isn't uniform, - // we would need to do a wave reduction to get the maximum size to know how - // much to increment the uniform stack pointer. - SDValue Size = Op.getOperand(1); - if (isa<ConstantSDNode>(Size)) - return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. - - return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); + return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl); } SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { @@ -13982,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +SDValue +SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue RHS = N->getOperand(1); + auto *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) + return SDValue(); + + // TODO: Worth using computeKnownBits? Maybe expensive since it's so + // common. + uint64_t Val = CRHS->getZExtValue(); + if (countr_zero(Val) >= 32) { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + + // Avoid carry machinery if we know the low half of the add does not + // contribute to the final result. + // + // add i64:x, K if computeTrailingZeros(K) >= 32 + // => build_pair (add x.hi, K.hi), x.lo + + // Breaking the 64-bit add here with this strange constant is unlikely + // to interfere with addressing mode patterns. + + SDValue Hi = getHiHalf64(LHS, DAG); + SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue AddHi = + DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); + } + + return SDValue(); +} + // Collect the ultimate src of each of the mul node's operands, and confirm // each operand is 8 bytes. static std::optional<ByteProvider<SDValue>> @@ -14258,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return V; } + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { SDValue TempNode(N, 0); @@ -14443,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if (VT != MVT::i32) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 631f26542bbe..299c8f5f7392 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -212,6 +212,9 @@ private: unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -421,7 +424,6 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; - SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f97ea40caa67..e6f333fbb878 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3805,6 +3805,36 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, } } +static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + return AMDGPU::V_MAD_F16_e64; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + return AMDGPU::V_MAD_F32_e64; + case AMDGPU::V_MAC_LEGACY_F32_e32: + case AMDGPU::V_MAC_LEGACY_F32_e64: + return AMDGPU::V_MAD_LEGACY_F32_e64; + case AMDGPU::V_FMAC_LEGACY_F32_e32: + case AMDGPU::V_FMAC_LEGACY_F32_e64: + return AMDGPU::V_FMA_LEGACY_F32_e64; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64 + : AMDGPU::V_FMA_F16_gfx9_e64; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + return AMDGPU::V_FMA_F32_e64; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + return AMDGPU::V_FMA_F64_e64; + default: + llvm_unreachable("invalid instruction"); + } +} + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { @@ -4040,14 +4070,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, if (Src0Literal && !ST.hasVOP3Literal()) return nullptr; - unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 - : IsF64 ? AMDGPU::V_FMA_F64_e64 - : IsLegacy - ? AMDGPU::V_FMA_LEGACY_F32_e64 - : AMDGPU::V_FMA_F32_e64 - : IsF16 ? AMDGPU::V_MAD_F16_e64 - : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 - : AMDGPU::V_MAD_F32_e64; + unsigned NewOpc = getNewFMAInst(ST, Opc); + if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; @@ -6866,9 +6890,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (RsrcIdx != -1) { MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); - if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) { + if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg())) isRsrcLegal = false; - } } // The operands are legal. @@ -9294,6 +9317,7 @@ static bool isRenamedInGFX9(int Opcode) { case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64: case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + case AMDGPU::V_FMA_F16_gfx9_fake16_e64: case AMDGPU::V_INTERP_P2_F16: case AMDGPU::V_MAD_F16_e64: case AMDGPU::V_MAD_U16_e64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 789ce8815cf8..ee83dff227a8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2674,8 +2674,8 @@ let OtherPredicates = [NotHasTrue16BitInsts] in { } // end OtherPredicates = [NotHasTrue16BitInsts] let OtherPredicates = [HasTrue16BitInsts] in { - def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; - def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; + def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; + def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; } // end OtherPredicates = [HasTrue16BitInsts] def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; @@ -3055,7 +3055,7 @@ def : GCNPat< (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; // If fcanonicalize's operand is implicitly canonicalized, we only need a copy. -let AddedComplexity = 1000 in { +let AddedComplexity = 8 in { foreach vt = [f16, v2f16, f32, v2f32, f64] in { def : GCNPat< (fcanonicalize (vt is_canonicalized:$src)), @@ -3710,12 +3710,15 @@ def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; -def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; -def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; -def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; -def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + +let True16Predicate = UseFakeTrue16Insts in { +def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; } let SubtargetPredicate = isGFX9Plus in { @@ -3723,6 +3726,10 @@ let True16Predicate = NotHasTrue16BitInsts in { defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, VSrc_b16>; defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, VSrc_b16>; } +let True16Predicate = UseRealTrue16Insts in { + defm : Int16Med3Pat<V_MED3_I16_t16_e64, smin, smax, VSrcT_b16>; + defm : Int16Med3Pat<V_MED3_U16_t16_e64, umin, umax, VSrcT_b16>; +} let True16Predicate = UseFakeTrue16Insts in { defm : Int16Med3Pat<V_MED3_I16_fake16_e64, smin, smax, VSrc_b16>; defm : Int16Med3Pat<V_MED3_U16_fake16_e64, umin, umax, VSrc_b16>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 16a7a9cfbc49..7c98ccddb5dd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -153,14 +153,16 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> } multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, - bit isVGPR = 0, bit isAGPR = 0> { + bit isVGPR = 0, bit isAGPR = 0, + list<int> DwarfEncodings = [-1, -1]> { def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>; def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx, isVGPR, isAGPR, /* isHi16 */ 1> { let isArtificial = ArtificialHigh; } def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"), - !cast<Register>(NAME#"_HI16")]> { + !cast<Register>(NAME#"_HI16")]>, + DwarfRegNum<DwarfEncodings> { let Namespace = "AMDGPU"; let SubRegIndices = [lo16, hi16]; let CoveredBySubRegs = !not(ArtificialHigh); @@ -197,7 +199,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let HWEncoding = VCC_LO.HWEncoding; } -defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>; +defm EXEC_LO : SIRegLoHi16<"exec_lo", 126, /*ArtificialHigh=*/1, /*isVGPR=*/0, + /*isAGPR=*/0, /*DwarfEncodings=*/[1, 1]>; defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>; def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> { @@ -337,25 +340,26 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; // SGPR registers foreach Index = 0...105 in { defm SGPR#Index : - SIRegLoHi16 <"s"#Index, Index>, - DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)), - !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>; + SIRegLoHi16 <"s"#Index, Index, /*ArtificialHigh=*/1, + /*isVGPR=*/0, /*isAGPR=*/0, /*DwarfEncodings=*/ + [!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)), + !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>; } // VGPR registers foreach Index = 0...255 in { defm VGPR#Index : - SIRegLoHi16 <"v"#Index, Index, /* ArtificialHigh= */ 0, - /* isVGPR= */ 1, /* isAGPR= */ 0>, - DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>; + SIRegLoHi16 <"v"#Index, Index, /*ArtificialHigh=*/ 0, + /*isVGPR=*/ 1, /*isAGPR=*/ 0, /*DwarfEncodings=*/ + [!add(Index, 2560), !add(Index, 1536)]>; } // AccVGPR registers foreach Index = 0...255 in { defm AGPR#Index : - SIRegLoHi16 <"a"#Index, Index, /* ArtificialHigh= */ 1, - /* isVGPR= */ 0, /* isAGPR= */ 1>, - DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>; + SIRegLoHi16 <"a"#Index, Index, /*ArtificialHigh=*/ 1, + /*isVGPR=*/ 0, /*isAGPR=*/ 1, /*DwarfEncodings=*/ + [!add(Index, 3072), !add(Index, 2048)]>; } //===----------------------------------------------------------------------===// @@ -809,6 +813,9 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, let BaseClassOrder = 32; } +def SGPR_NULL128 : SIReg<"null">; +def SGPR_NULL256 : SIReg<"null">; + let GeneratePressureSet = 0 in { def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { @@ -885,6 +892,7 @@ multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, + bit hasNull = 0, int copyCost = !sra(!add(numRegs, 1), 1)> { defvar hasTTMP = !ne(regList, ttmpList); defvar suffix = !cast<string>(!mul(numRegs, 32)); @@ -901,7 +909,7 @@ multiclass SRegClass<int numRegs, } } - def SReg_ # suffix : + def SReg_ # suffix # !if(hasNull, "_XNULL", ""): SIRegisterClass<"AMDGPU", regTypes, 32, !con((add !cast<RegisterClass>(sgprName)), !if(hasTTMP, @@ -910,15 +918,24 @@ multiclass SRegClass<int numRegs, let isAllocatable = 0; let BaseClassOrder = !mul(numRegs, 32); } + + if hasNull then { + def SReg_ # suffix : + SIRegisterClass<"AMDGPU", regTypes, 32, + (add !cast<RegisterClass>("SReg_" # suffix # "_XNULL"), !cast<Register>("SGPR_NULL" # suffix))> { + let isAllocatable = 0; + let BaseClassOrder = !mul(numRegs, 32); + } + } } } defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs, /*hasNull*/ true>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>; defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 42df4576a774..979812e07fc3 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -455,6 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16; break; @@ -484,6 +485,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16; break; @@ -956,7 +958,8 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || - MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) { + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { shrinkMadFma(MI); continue; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 1aeb4e8b20e8..37dcc1008625 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -332,19 +332,19 @@ defm S_LOAD_I16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; defm S_LOAD_U16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; let is_buffer = 1 in { -defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on // SI/CI, bit disallowed for SMEM on VI. -defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_64_XEXEC>; let SubtargetPredicate = HasScalarDwordx3Loads in - defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>; -defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>; -defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>; -defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>; -defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; + defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_96>; +defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_128>; +defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_256>; +defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_512>; +defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; } let SubtargetPredicate = HasScalarStores in { @@ -353,9 +353,9 @@ defm S_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>; defm S_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>; let is_buffer = 1 in { -defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128, SReg_128>; +defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128_XNULL, SReg_128>; } } // End SubtargetPredicate = HasScalarStores @@ -375,7 +375,7 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb defm S_ATC_PROBE : SM_Pseudo_Probe <SReg_64>; let is_buffer = 1 in { -defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>; +defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128_XNULL>; } } // SubtargetPredicate = isGFX8Plus @@ -401,33 +401,33 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>; let SubtargetPredicate = HasScalarAtomics in { let is_buffer = 1 in { -defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; - -defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_128>; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_128>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>; } defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; @@ -470,7 +470,7 @@ def S_PREFETCH_INST : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>; def S_PREFETCH_INST_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>; def S_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>; def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>; -def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> { +def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128_XNULL, 1> { let is_buffer = 1; } } // end let SubtargetPredicate = isGFX12Plus diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1dd39be9e8d9..b9c73e6ce8ef 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1018,9 +1018,9 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; -defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">; -defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; -defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; +defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">; +defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; +defm V_CVT_U32_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050, "v_cvt_f16_u16">; defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x051, "v_cvt_f16_i16">; @@ -1036,18 +1036,18 @@ defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16" defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; +defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">; defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">; -defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; -defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; -defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">; -defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; -defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; +defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; +defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e, "v_rndne_f16">; +defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">; +defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; +defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">; +defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 22e457674c07..24a2eede9ca3 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -340,7 +340,7 @@ let FPDPRounding = 1 in { let SubtargetPredicate = isGFX9Plus in { defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst_t16 <"v_div_fixup_f16_gfx9", VOP_F16_F16_F16_F16, AMDGPUdiv_fixup>; - defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>; + defm V_FMA_F16_gfx9 : VOP3Inst_t16 <"v_fma_f16_gfx9", VOP_F16_F16_F16_F16, any_fma>; } // End SubtargetPredicate = isGFX9Plus } // End FPDPRounding = 1 @@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR let SubtargetPredicate = isGFX11Plus in { defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; - defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; - defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; + defm V_MAXMIN_F16 : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>; + defm V_MINMAX_F16 : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>; defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>; defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">; defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">; -defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">; -defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">; +defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; +defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">; defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>; defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>; defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>; @@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">; defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; -defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">; -defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">; +defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; +defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>; defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>; defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>; @@ -1708,7 +1708,7 @@ defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>; defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>; defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>; defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>; -defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">; +defm V_FMA_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x248, "v_fma_f16", "V_FMA_F16_gfx9">; defm V_MIN3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">; defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">; defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">; @@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; -defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; -defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; +defm V_MAXMIN_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">; +defm V_MINMAX_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">; defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>; defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>; defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 9bf043ea334f..8589d598f587 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1130,20 +1130,20 @@ defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>; defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>; defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_t16_e64, i16>; -defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_t16_e64, i16>; -defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_t16_e64, i16>; -defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_t16_e64, i16>; -defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_t16_e64, i16>; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_fake16_e64, i16>; +defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_fake16_e64, i16>; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>; defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>; defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>; @@ -1154,7 +1154,7 @@ defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>; defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>; defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>; defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { let WaveSizePredicate = isWave64 in @@ -1215,25 +1215,25 @@ defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>; defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : FCMP_Pattern <COND_O, V_CMP_O_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_t16_e64, f16>; - -defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_t16_e64, f16>; -defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_t16_e64, f16>; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : FCMP_Pattern <COND_O, V_CMP_O_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_fake16_e64, f16>; + +defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_fake16_e64, f16>; +defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_fake16_e64, f16>; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : FCMP_Pattern <COND_O, V_CMP_O_F16_e64, f16>; defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_e64, f16>; defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>; @@ -1249,7 +1249,7 @@ defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>; defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>; defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>; defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts //===----------------------------------------------------------------------===// // DPP Encodings @@ -1707,23 +1707,6 @@ multiclass VOPCX_Real_t16_gfx11_gfx12<bits<9> op, string asm_name, VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>, VOPCX_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>; -defm V_CMP_F_F16_t16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; -defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; -defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; -defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; -defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; -defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; -defm V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; -defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; -defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; -defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; -defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; -defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; -defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; -defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; -defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; -defm V_CMP_T_F16_t16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_t16", "v_cmp_tru_f16">; - defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; defm V_CMP_LT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; @@ -1759,19 +1742,6 @@ defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; -defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; -defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; -defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; -defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; -defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; -defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; -defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; -defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; -defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; -defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; -defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; -defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; - defm V_CMP_LT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; defm V_CMP_EQ_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; defm V_CMP_LE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; @@ -1819,28 +1789,10 @@ defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>; defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; -defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; -defm V_CMPX_F_F16_t16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; -defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; -defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; -defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; - defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; defm V_CMPX_LT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; @@ -1892,19 +1844,6 @@ defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>; defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>; defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; -defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; -defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; -defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; -defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; -defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; -defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; -defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; -defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; -defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; -defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; -defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; -defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; - defm V_CMPX_LT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; defm V_CMPX_EQ_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; defm V_CMPX_LE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; @@ -1951,7 +1890,6 @@ defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>; defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>; defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; -defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index d236907b0eec..930ed9a5e2d0 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1909,8 +1909,8 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> { - defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>; - defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>; + defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>; + defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>; } multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName, |
