diff options
| author | Peter Collingbourne <peter@pcc.me.uk> | 2025-07-18 13:26:00 -0700 |
|---|---|---|
| committer | Peter Collingbourne <peter@pcc.me.uk> | 2025-07-18 13:26:00 -0700 |
| commit | 9bf3524731070cadc6175707314f3b6ca37190d5 (patch) | |
| tree | 86dcab7604336b01ae938fe81062c29ff69efba8 /llvm/lib/Target/AMDGPU | |
| parent | 3a84c15cc13b6daf8e812592898ab6c7f19091a9 (diff) | |
| parent | 4f43f0606c3d7e1ce6d069583b5e59f036e112ce (diff) | |
Created using spr 1.3.6-beta.1
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
63 files changed, 2258 insertions, 1353 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23f106a9c1d4..007b481f8496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -153,6 +153,9 @@ private: const TargetMachine &TM; }; +void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); +extern char &AMDGPUPrepareAGPRAllocLegacyID; + void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &); extern char &AMDGPUReserveWWMRegsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 31420caca089..0e0e83b7a6b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -541,6 +547,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; +def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", + "HasBF16TransInsts", + "true", + "Has bf16 transcendental instructions" +>; + def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", "HasBF16ConversionInsts", "true", @@ -1106,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1948,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, 
FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -1966,7 +1985,9 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, + FeatureBF16TransInsts, FeatureBF16ConversionInsts, FeatureCvtPkF16F32Inst, FeatureMinimum3Maximum3PKF16, @@ -2374,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -2442,6 +2466,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, + AssemblerPredicate<(all_of FeatureBF16TransInsts)>; + def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; @@ -2657,6 +2684,9 @@ def HasDefaultComponentBroadcast def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; +def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; @@ -2680,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 79cf49f88d6d..dedee46a4423 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,11 +13,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Analysis/CycleAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" -#include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/Attributor.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 
22b921fb2084..5f1983791cfa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads( cl::ReallyHidden, cl::init(false)); -static cl::opt<bool> Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc( - "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, cl::init(false)); - static cl::opt<bool> BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", cl::desc("Break large PHI nodes for DAGISel"), @@ -150,18 +144,6 @@ public: bool canBreakPHINode(const PHINode &I); - /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to - /// binary operation \p V. - /// - /// \returns Binary operation \p V. - /// \returns \p T's base element bit width. - unsigned getBaseElementBitWidth(const Type *T) const; - - /// \returns Equivalent 32 bit integer type for given type \p T. For example, - /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> - /// is returned. - Type *getI32Ty(IRBuilder<> &B, const Type *T) const; - /// \returns True if binary operation \p I is a signed binary operation, false /// otherwise. bool isSigned(const BinaryOperator &I) const; @@ -170,10 +152,6 @@ public: /// signed 'icmp' operation, false otherwise. bool isSigned(const SelectInst &I) const; - /// \returns True if type \p T needs to be promoted to 32 bit integer type, - /// false otherwise. - bool needsPromotionToI32(const Type *T) const; - /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; @@ -188,52 +166,6 @@ public: computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal(); } - /// Promotes uniform binary operation \p I to equivalent 32 bit binary - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. 
Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and - /// truncating the result of 32 bit binary operation back to \p I's original - /// type. Division operation is not promoted. - /// - /// \returns True if \p I is promoted to equivalent 32 bit binary operation, - /// false otherwise. - bool promoteUniformOpToI32(BinaryOperator &I) const; - - /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. - /// - /// \returns True. - bool promoteUniformOpToI32(ICmpInst &I) const; - - /// Promotes uniform 'select' operation \p I to 32 bit 'select' - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the - /// result of 32 bit 'select' operation back to \p I's original type. - /// - /// \returns True. - bool promoteUniformOpToI32(SelectInst &I) const; - - /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' - /// intrinsic. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by zero extending the operand to 32 - /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the - /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the - /// shift amount is 32 minus \p I's base element bit width), and truncating - /// the result of the shift operation back to \p I's original type. - /// - /// \returns True. 
- bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; - /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to /// the original will not change the value. @@ -320,13 +252,11 @@ public: bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); bool visitLoadInst(LoadInst &I); - bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); bool visitPHINode(PHINode &I); bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); - bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return T->getIntegerBitWidth(); - return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); -} - -Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return B.getInt32Ty(); - return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T)); -} - bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { return I.getOpcode() == Instruction::AShr || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; @@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { cast<ICmpInst>(I.getOperand(0))->isSigned(); } -bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { - if (!Widen16BitOps) - return false; - - const IntegerType *IntTy = dyn_cast<IntegerType>(T); - if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) - return true; - 
- if (const VectorType *VT = dyn_cast<VectorType>(T)) { - // TODO: The set of packed operations is more limited, so may want to - // promote some anyway. - if (ST.hasVOP3PInsts()) - return false; - - return needsPromotionToI32(VT->getElementType()); - } - - return false; -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); } -// Return true if the op promoted to i32 should have nsw set. -static bool promotedOpIsNSW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Sub: - return true; - case Instruction::Mul: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - -// Return true if the op promoted to i32 should have nuw set. -static bool promotedOpIsNUW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Mul: - return true; - case Instruction::Sub: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); int TySize = DL.getTypeSizeInBits(Ty); @@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - if (I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::UDiv || - I.getOpcode() == Instruction::SRem || - I.getOpcode() == Instruction::URem) - return false; - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if 
(isSigned(I)) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - - ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); - if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { - if (promotedOpIsNSW(cast<Instruction>(I))) - Inst->setHasNoSignedWrap(); - - if (promotedOpIsNUW(cast<Instruction>(I))) - Inst->setHasNoUnsignedWrap(); - - if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) - Inst->setIsExact(ExactOp->isExact()); - } - - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { - assert(needsPromotionToI32(I.getOperand(0)->getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *NewICmp = nullptr; - - if (I.isSigned()) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); - - I.replaceAllUsesWith(NewICmp); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp1 = nullptr; - Value *ExtOp2 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if 
(isSigned(I)) { - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); - } else { - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); - } - ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( - IntrinsicInst &I) const { - assert(I.getIntrinsicID() == Intrinsic::bitreverse && - "I must be bitreverse intrinsic"); - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); - Value *ExtRes = - Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp}); - Value *LShrOp = - Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); - Value *TruncRes = - Builder.CreateTrunc(LShrOp, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { return computeKnownBits(Op, DL, AC).countMaxActiveBits(); } @@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I) && promoteUniformOpToI32(I)) - return true; - if (UseMul24Intrin && replaceMulWithMul24(I)) return true; if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), @@ -1770,16 +1504,6 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { return false; } -bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && 
needsPromotionToI32(I.getOperand(0)->getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformOpToI32(I); - - return Changed; -} - bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *Cond = I.getCondition(); Value *TrueVal = I.getTrueValue(); @@ -1787,12 +1511,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *CmpVal; CmpPredicate Pred; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) { - if (UA.isUniform(&I)) - return promoteUniformOpToI32(I); - return false; - } - // Match fract pattern with nan check. if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN()))) return false; @@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { - case Intrinsic::bitreverse: - return visitBitreverseIntrinsicInst(I); case Intrinsic::minnum: case Intrinsic::minimumnum: case Intrinsic::minimum: @@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { } } -bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformBitreverseToI32(I); - - return Changed; -} - /// Match non-nan fract pattern. 
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) /// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 1b909568fc55..7b5d4077e85f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -55,6 +55,14 @@ def gi_vop3pmodsneg : GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, GIComplexPatternEquiv<VOP3PModsNeg>; +def gi_vop3pmodsnegs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">, + GIComplexPatternEquiv<VOP3PModsNegs>; + +def gi_dotiuvop3pmodsnegabs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">, + GIComplexPatternEquiv<VOP3PModsNegAbs>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; @@ -83,6 +91,10 @@ def gi_swmmacindex16 : GIComplexOperandMatcher<s32, "selectSWMMACIndex16">, GIComplexPatternEquiv<SWMMACIndex16>; +def gi_swmmacindex32 : + GIComplexOperandMatcher<s64, "selectSWMMACIndex32">, + GIComplexPatternEquiv<SWMMACIndex32>; + def gi_vop3opselmods : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, GIComplexPatternEquiv<VOP3OpSelMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 202693b31612..25672a52345c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { return; } + bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); + if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 && + CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) { + uint64_t C = 0; + bool AllConst = true; + unsigned EltSize = EltVT.getSizeInBits(); + for (unsigned I = 0; I < NumVectorElts; ++I) { + SDValue Op = N->getOperand(I); + if (Op.isUndef()) { + AllConst = false; + break; + } + uint64_t 
Val; + if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) { + Val = CF->getValueAPF().bitcastToAPInt().getZExtValue(); + } else + Val = cast<ConstantSDNode>(Op)->getZExtValue(); + C |= Val << (EltSize * I); + } + if (AllConst) { + SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64); + MachineSDNode *Copy = + CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV); + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0), + RegClass); + return; + } + } + assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " "supported yet"); // 32 = Max Num Vector Elements @@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { // 1 = Vector Register Class SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); - bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; unsigned NOps = N->getNumOperands(); @@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case ISD::ConstantFP: { - if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) || + Subtarget->has64BitLiterals()) break; uint64_t Imm; @@ -1632,8 +1661,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -3245,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +// Select neg_lo from the i1 immediate operand. 
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -3260,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { return true; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getZExtValue(); + if (SrcSign == 1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcMod = C->getZExtValue(); + switch (SrcMod) { + default: // Any other value will be silently ignored (considered as 0). 
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); @@ -3611,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + SDValue InI32; + + if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) { + const SDValue &ExtendSrc = In.getOperand(0); + if (ExtendSrc.getValueSizeInBits() == 32) + InI32 = ExtendSrc; + } else if (In->getOpcode() == ISD::BITCAST) { + const SDValue &CastSrc = In.getOperand(0); + if (CastSrc.getOpcode() == ISD::BUILD_VECTOR && + CastSrc.getOperand(0).getValueSizeInBits() == 32) { + ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1)); + if (Zero && Zero->getZExtValue() == 0) + InI32 = CastSrc.getOperand(0); + } + } + + if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + const SDValue &ExtractVecEltSrc = InI32.getOperand(0); + ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1)); + if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx && + EltIdx->getZExtValue() == 1) { + Key = 1; + Src = ExtractVecEltSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; @@ -3885,10 +3990,8 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { assert(CurDAG->getTarget().getTargetTriple().isAMDGCN()); - const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo 
*>(Subtarget->getRegisterInfo()); - const SIInstrInfo * SII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); + const SIInstrInfo *SII = Subtarget->getInstrInfo(); unsigned Limit = 0; bool AllUsesAcceptSReg = true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index f3b9364fdb92..9967f46e085e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -222,6 +222,8 @@ private: bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, @@ -233,6 +235,7 @@ private: bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e64d2162441a..3d040fb705a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4006,7 +4006,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: - case Intrinsic::amdgcn_rsq_clamp: { + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? 
Src : SDValue(); @@ -4842,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. +static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. 
+ if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. + // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). 
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -5733,6 +5817,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(PC_ADD_REL_OFFSET64) NODE_NAME_CASE(LDS) NODE_NAME_CASE(DUMMY_CHAIN) NODE_NAME_CASE(LOAD_D16_HI) @@ -6196,7 +6281,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode( case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: - case Intrinsic::amdgcn_rsq_clamp: { + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: { if (SNaN) return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0dd2183b72b2..4e8c6c7ea3b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -545,6 +545,7 @@ enum NodeType : unsigned { /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, PC_ADD_REL_OFFSET, + PC_ADD_REL_OFFSET64, LDS, DUMMY_CHAIN, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebffb70d..9a90787963d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ public: // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. 
+ DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. + if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ public: continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ public: LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index b8996fb97f1c..e2c2e8912c71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } case Intrinsic::amdgcn_sqrt: - case Intrinsic::amdgcn_rsq: { + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_tanh: { Value *Src = II.getArgOperand(0); if (isa<PoisonValue>(Src)) return IC.replaceInstUsesWith(II, Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ea79c57080fa..1a63c48e3666 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { return Register(); } +Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { + Register AnyExtSrc; + if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) + return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef())) + return Def->getOperand(1).getReg(); + + return Register(); +} + bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (!Subtarget->hasVMemToLDSLoad()) return false; @@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +// Select neg_lo from the i1 immediate operand. InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { }}; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. 
+ // 1(-1) promotes packed values to signed, 0 treats them as unsigned. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { + + assert(Root.isImm() && "Modifier for C must be an immediate"); + + unsigned Mods = SISrcMods::OP_SEL_1; + switch (Root.getImm()) { + default: // Any other value will be silently ignored (considered as 0). + break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -5150,6 +5214,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register S32 = matchZeroExtendFromS32(*MRI, Src); + if (!S32) + S32 = matchAnyExtendFromS32(Src); + + if (S32) { + const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI); + if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + assert(Def->getNumOperands() == 3); + Register DstReg1 = Def->getOperand(1).getReg(); + if (mi_match(S32, *MRI, + m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) { + Src = Def->getOperand(2).getReg(); + Key = 1; + } + } + } + + return {{ + [=](MachineInstrBuilder 
&MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; unsigned Mods; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8e9e573147a8..2cb7904d27cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -201,6 +201,10 @@ private: InstructionSelector::ComplexRendererFns selectVOP3PModsNeg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegAbs(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; @@ -217,6 +221,8 @@ private: selectSWMMACIndex8(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; @@ -411,6 +417,9 @@ private: // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match an any extend from a 32-bit value to 64-bit. 
+ Register matchAnyExtendFromS32(Register Reg) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index aa678df675fb..e7bf88d2ee5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2932,14 +2932,22 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : B.getMRI()->createGenericVirtualRegister(ConstPtrTy); - MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) - .addDef(PCReg); + if (ST.has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); - MIB.addGlobalAddress(GV, Offset, GAFlags); - if (GAFlags == SIInstrInfo::MO_NONE) - MIB.addImm(0); - else - MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg); + MIB.addGlobalAddress(GV, Offset, GAFlags + 2); + } else { + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + } if (!B.getMRI()->getRegClassOrNull(PCReg)) B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); @@ -2955,6 +2963,15 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress( MachineRegisterInfo &MRI) const { bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; + if (RequiresHighHalf && ST.has64BitLiterals()) { + if (!MRI.getRegClassOrNull(DstReg)) + MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + B.buildInstr(AMDGPU::S_MOV_B64) + .addDef(DstReg) + .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64); + return; + } + LLT S32 = LLT::scalar(32); // Use the destination directly, if and only if we store the lower address @@ -7622,6 +7639,20 @@ bool 
AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S64 = LLT::scalar(64); + if (MRI.getType(Index) != S64) + MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: @@ -7636,15 +7667,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { Register Index = MI.getOperand(7).getReg(); - LLT S32 = LLT::scalar(32); - if (MRI.getType(Index) != S32) - MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
LLT::scalar(64) + : LLT::scalar(32); + if (MRI.getType(Index) != IdxTy) + MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2dec16de940d..c84a0f6e3138 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -50,6 +50,7 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { default: return AMDGPUMCExpr::S_None; case SIInstrInfo::MO_GOTPCREL: + case SIInstrInfo::MO_GOTPCREL64: return AMDGPUMCExpr::S_GOTPCREL; case SIInstrInfo::MO_GOTPCREL32_LO: return AMDGPUMCExpr::S_GOTPCREL32_LO; @@ -59,10 +60,14 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { return AMDGPUMCExpr::S_REL32_LO; case SIInstrInfo::MO_REL32_HI: return AMDGPUMCExpr::S_REL32_HI; + case SIInstrInfo::MO_REL64: + return AMDGPUMCExpr::S_REL64; case SIInstrInfo::MO_ABS32_LO: return AMDGPUMCExpr::S_ABS32_LO; case SIInstrInfo::MO_ABS32_HI: return AMDGPUMCExpr::S_ABS32_HI; + case SIInstrInfo::MO_ABS64: + return AMDGPUMCExpr::S_ABS64; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 5d298304c27f..b6c6d927d0e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -114,7 +114,9 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) +MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) 
+MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp new file mode 100644 index 000000000000..3b06e9b00ac6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -0,0 +1,108 @@ +//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Make simple transformations to relax register constraints for cases which can +// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into +// AGPR or VGPR with a pseudo with an AV_* class register constraint. This +// allows later passes to inflate the register class if necessary. The register +// allocator does not know to replace instructions to relax constraints. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUPrepareAGPRAlloc.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" + +namespace { + +class AMDGPUPrepareAGPRAllocImpl { +private: + const SIInstrInfo &TII; + MachineRegisterInfo &MRI; + +public: + AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI) + : TII(*ST.getInstrInfo()), MRI(MRI) {} + bool run(MachineFunction &MF); +}; + +class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUPrepareAGPRAllocLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) +INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) + +char AMDGPUPrepareAGPRAllocLegacy::ID = 0; + +char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID; + +bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); +} + +PreservedAnalyses +AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); + return PreservedAnalyses::all(); +} + +bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { + if (MRI.isReserved(AMDGPU::AGPR0)) + return false; + + const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + TII.isInlineConstant(MI, 1)) || + (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOperand(1).isImm())) { + MI.setDesc(AVImmPseudo); + Changed = true; + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h new file mode 100644 index 000000000000..dc598c98f241 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h @@ -0,0 +1,23 @@ +//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class AMDGPUPrepareAGPRAllocPass + : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 7a2a7fc250e2..f5e14c71b02d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -88,7 +88,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers( // are %p and %s, which use to know if we // are either storing a literal string or a // pointer to the printf buffer. 
- static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + static const char ConvSpecifiers[] = "cdieEfFgGaAosuxXp"; size_t CurFmtSpecifierIdx = 0; size_t PrevFmtSpecifierIdx = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 6a59a28b1d32..411159c8aa33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/Support/AMDGPUAddrSpace.h" #define DEBUG_TYPE "amdgpu-regbanklegalize" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1483d97d23fc..bf2f37bddb9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fma_legacy: case Intrinsic::amdgcn_frexp_mant: @@ -4557,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_u16: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_sat_pk4_i4_i8: + case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: @@ -4688,6 +4691,44 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: + case 
Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: + case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: + case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: + case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case 
Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 46027b889023..8101c6898624 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass, + /*IncludeCalls=*/false); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass, + /*IncludeCalls=*/false); // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, + /*IncludeCalls=*/false); return Info; } int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? 
- for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; Register Reg = MO.getReg(); switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - case AMDGPU::NoRegister: assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); continue; - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: @@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( break; } - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if 
(AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if 
(AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. 
- assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } + const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg); + assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) || + TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg)) && + "Unknown register class"); + + if (!RC || !TRI.isVGPRClass(RC)) + continue; + + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; - } + MaxVGPR = std::max(MaxUsed, MaxVGPR); } if (MI.isCall()) { @@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 1f6002a3c6a2..dfe0cbf18c47 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in def : SourceOfDivergence<intr>; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in def : SourceOfDivergence<intr>; +foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in +def : SourceOfDivergence<intr>; +foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in +def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7c24f428d78e..1e44be8e4720 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -59,6 +59,7 @@ protected: bool HasCvtPkF16F32Inst = false; bool HasF32ToF16BF16ConversionSRInsts = false; bool EnableRealTrue16Insts = false; + bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; @@ -202,6 +203,8 @@ public: // supported and the support for fake True16 instructions is removed. 
bool useRealTrue16Insts() const; + bool hasBF16TransInsts() const { return HasBF16TransInsts; } + bool hasBF16ConversionInsts() const { return HasBF16ConversionInsts; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a483181..c865082a1dce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -25,6 +25,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" +#include "AMDGPUPrepareAGPRAlloc.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUReserveWWMRegs.h" #include "AMDGPUResourceUsageAnalysis.h" @@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGlobalISel(*PR); initializeAMDGPUAsmPrinterPass(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); + initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); @@ -1196,6 +1198,7 @@ public: bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; + void addPreRegAlloc() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +void GCNPassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(&AMDGPUPrepareAGPRAllocLegacyID); +} + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( Base::addOptimizedRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { + if (getOptLevel() != CodeGenOptLevel::None) + 
addPass(AMDGPUPrepareAGPRAllocPass()); +} + Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { // TODO: Check --regalloc-npm option @@ -2284,6 +2297,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3c62cd19c6e5..e0f1296ddded 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -181,8 +181,11 @@ public: void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; + void addPreEmitRegAlloc(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addPreRegAlloc(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. 
Otherwise its default will be used diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6439230b8769..43d4e8db791b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -157,6 +157,7 @@ public: ImmTyNegHi, ImmTyIndexKey8bit, ImmTyIndexKey16bit, + ImmTyIndexKey32bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -174,8 +175,10 @@ public: ImmTyWaitEXP, ImmTyWaitVAVDst, ImmTyWaitVMVSrc, - ImmTyByteSel, ImmTyBitOp3, + ImmTyMatrixAReuse, + ImmTyMatrixBReuse, + ImmTyByteSel, }; // Immediate operand kind. @@ -419,6 +422,9 @@ public: bool isCPol() const { return isImmTy(ImmTyCPol); } bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } + bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } + bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } + bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppFI() const { return isImmTy(ImmTyDppFI); } @@ -747,6 +753,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64); } + bool isVISrc_512_f64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f64); + } + bool isVISrc_128B16() const { return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16); } @@ -1116,6 +1126,7 @@ public: case ImmTyCPol: OS << "CPol"; break; case ImmTyIndexKey8bit: OS << "index_key"; break; case ImmTyIndexKey16bit: OS << "index_key"; break; + case ImmTyIndexKey32bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1162,8 +1173,10 @@ public: case ImmTyWaitEXP: OS << "WaitEXP"; break; case ImmTyWaitVAVDst: OS << 
"WaitVAVDst"; break; case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; - case ImmTyByteSel: OS << "ByteSel" ; break; case ImmTyBitOp3: OS << "BitOp3"; break; + case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; + case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; + case ImmTyByteSel: OS << "ByteSel" ; break; } // clang-format on } @@ -1700,6 +1713,7 @@ public: AMDGPUOperand::ImmTy ImmTy); ParseStatus parseIndexKey8bit(OperandVector &Operands); ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseIndexKey32bit(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -3981,8 +3995,8 @@ bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst, bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3; if (AsVOPD3) { - for (unsigned I = 0, E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + for (const std::unique_ptr<MCParsedAsmOperand> &Operand : Operands) { + AMDGPUOperand &Op = (AMDGPUOperand &)*Operand; if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) && (Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS)) Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions"); @@ -7153,7 +7167,9 @@ ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, if (!Res.isSuccess()) return Res; - if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + if ((ImmTy == AMDGPUOperand::ImmTyIndexKey16bit || + ImmTy == AMDGPUOperand::ImmTyIndexKey32bit) && + (ImmVal < 0 || ImmVal > 1)) return Error(Loc, Twine("out of range ", StringRef(Pref))); if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) @@ -7171,6 +7187,10 @@ ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); } +ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, 
AMDGPUOperand::ImmTyIndexKey32bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9272,6 +9292,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, DefaultVal); } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAReuse, 0); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_b_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBReuse, 0); + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); @@ -9378,6 +9406,10 @@ void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyIndexKey16bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_32bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey32bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClamp); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e3519f192137..42edec0d0149 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp + AMDGPUPrepareAGPRAlloc.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9a4791..c8a4e22ed1da 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = !if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode. class GlobalSaddrTable <bit is_saddr, string Name = ""> { bit IsSaddr = is_saddr; string SaddrOp = Name; @@ -237,10 +238,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16<string opName> { - def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>; +multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; + defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, @@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = EnableSaddr; } -multiclass FLAT_Store_Pseudo_t16<string opName> { - def "" : FLAT_Store_Pseudo<opName, VGPR_32>; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>; +multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { + def "" : FLAT_Store_Pseudo<opName, regClass>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + GlobalSaddrTable<1, opName>; +} + 
+multiclass FLAT_Flat_Store_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_D16_HI", NAME>; + def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; + } } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, + RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, 
SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; 
-def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE 
: FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp (inst $saddr, $voffset, $offset, 0, $in) >; +class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt> : GCNPat < +class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, + ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; @@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp let AddedComplexity = 
10; } - def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat <inst, node, vt>; + + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + 
def : FlatLoadPat_D16_t16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <inst, node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, 
atomic_load_zext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, 
i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, 
i16>; def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>; -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>; foreach vt = Reg32Types.types in { -def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; -def : FlatStorePat <FLAT_STORE_BYTE, 
atomic_store_8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; +defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; -def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; } let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, 
az_extloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>; // appropriate waits. 
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>; +defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; @@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op, VFLAT_Aliases_gfx12<name, alias>, VFLAT_Real_gfx12<op, name>; -multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, - string name = get_FLAT_ps<NAME>.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12<op, name, alias> { - defm _RTN : VFLAT_Real_gfx12<op, name>; -} - -multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : VFLAT_Real_Base_gfx12<op, name, alias> { @@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Aliases_gfx12<name> { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, } } -multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, +multiclass 
VFLAT_Real_Atomics_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12<op, name, alias> { + VFLAT_Real_AllAddr_gfx12<op, name, alias> { defm _RTN : VFLAT_Real_gfx12<op, name>; defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>; } @@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, } // ENC_VFLAT. -defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : 
VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : 
VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. -defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, 
"global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : 
VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, 
"global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, 
"global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, 
"global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0976fccf78d8..bbed828b4fed 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { } fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); + fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); @@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled. 
+ !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isTRANS(I)) + return false; + + // RAW: Trans(I) writes, VALU(MI) reads. + Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + + // WAR: Trans(I) reads, VALU(MI) writes. + Register ValuDef = ValuDst->getReg(); + for (const MachineOperand &TransUse : I.explicit_uses()) { + if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg())) + return true; + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + const int HasVALU = std::numeric_limits<int>::max(); + if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + return true; +} + bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index bbc55851bf96..ef6ddd874f58 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -104,6 +104,7 @@ private: bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool 
fixVALUMaskWriteHazard(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d4596..a6553083d722 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e6dd98a10420..268162bcada4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ protected: bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -233,6 +234,7 @@ protected: bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1156,10 +1158,12 @@ public: bool hasMadF16() const; - bool hasMovB64() const { return GFX940Insts; } + bool hasMovB64() const { return GFX940Insts || GFX1250Insts; } bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1377,6 +1381,10 @@ public: return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + + bool hasAddPC64Inst() const { return GFX1250Insts; } + bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; } diff 
--git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index e7d0e1838fa6..2a920f6feb1c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -108,7 +108,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 22ae5f4e7191..0d5a8be6220d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -64,6 +64,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return ELF::R_AMDGPU_ABS32_LO; case AMDGPUMCExpr::S_ABS32_HI: return ELF::R_AMDGPU_ABS32_HI; + case AMDGPUMCExpr::S_ABS64: + return ELF::R_AMDGPU_ABS64; } MCFixupKind Kind = Fixup.getKind(); @@ -76,7 +78,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return IsPCRel ? 
ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64; } - if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { + if (Fixup.getKind() == AMDGPU::fixup_si_sopp_br) { const auto *SymA = Target.getAddSym(); assert(SymA); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index cb6319ed627c..ec9248b972ec 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1332,6 +1332,16 @@ void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, O << " index_key:" << Imm; } +void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index fb803b1f8134..e3299a618e88 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -132,6 +132,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey16bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 31dd373e54fb..ffdac8b8ce32 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -25,6 +25,7 @@ const MCAsmInfo::AtSpecifier atSpecifiers[] = { {AMDGPUMCExpr::S_REL64, "rel64"}, {AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"}, {AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"}, + {AMDGPUMCExpr::S_ABS64, "abs64"}, }; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 4bb3942936f0..f48739fe0181 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -381,9 +381,11 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. - if ((Desc.TSFlags & SIInstrFlags::VOP3P) || - Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || - Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { + if (((Desc.TSFlags & SIInstrFlags::VOP3P) || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && + // Matrix B reuse operand reuses op_sel_hi. 
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); } @@ -562,7 +564,8 @@ static bool needsPCRel(const MCExpr *Expr) { case MCExpr::SymbolRef: { auto *SE = cast<MCSymbolRefExpr>(Expr); auto Spec = AMDGPU::getSpecifier(SE); - return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI; + return Spec != AMDGPUMCExpr::S_ABS32_LO && + Spec != AMDGPUMCExpr::S_ABS32_HI && Spec != AMDGPUMCExpr::S_ABS64; } case MCExpr::Binary: { auto *BE = cast<MCBinaryExpr>(Expr); @@ -685,7 +688,12 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - addFixup(Fixups, Offset, MO.getExpr(), FK_Data_4, PCRel); + auto OpType = Desc.operands()[OpNo].OperandType; + MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + OpType == AMDGPU::OPERAND_REG_IMM_INT64) + ? FK_Data_8 + : FK_Data_4; + addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel); } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index e1b9720cdbfc..bc6fdf7f2e4c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -50,6 +50,7 @@ public: S_REL64, // symbol@rel64 S_ABS32_LO, // symbol@abs32@lo S_ABS32_HI, // symbol@abs32@hi + S_ABS64, // symbol@abs64 }; private: diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a46395695..f018f77bc83e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: 
@@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { // Copies and REG_SEQUENCE do not contribute to the final assembly // So, skip them but take care of the SGPR to VGPR copies bookkeeping. - if (Inst->isCopy() || Inst->isRegSequence()) { - if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { - if (!Inst->isCopy() || - !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { - Info.NumSVCopies++; - continue; - } + if (Inst->isRegSequence() && + TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + Info.NumSVCopies++; + continue; + } + if (Inst->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + Info.NumSVCopies++; + continue; } } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0ed06c37507a..e172c0b63189 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SetVector<MachineInstr *> ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI); - Changed = true; - } + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. 
TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2a10be4c2c7..0c76ff2ec5ea 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include <optional> using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16TransInsts()) { + setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); + } + if (Subtarget->hasCvtPkF16F32Inst()) { setOperationAction(ISD::FP_ROUND, {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}, @@ -3893,7 +3899,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + auto *TRI = Subtarget->getRegisterInfo(); // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass @@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. + if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); + + SDValue Ptr = + DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr); + } + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) @@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, } if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + if (Subtarget->has64BitLiterals()) { + SDValue Addr = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr), + 0); + } + SDValue AddrLo = DAG.getTargetGlobalAddress( GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; @@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( @@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 
Op.getOperand(3), IndexKeyi32); } + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i64) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi64, Op.getOperand(5), + Op.getOperand(6)}); + } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: { + EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
MVT::i64 + : MVT::i32; + if (Op.getOperand(6).getValueType() == IndexKeyTy) + return SDValue(); + + SDLoc SL(Op); + auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), + Op.getOperand(8)}); // No clamp operand + } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { @@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( if ((bitOpWithConstantIsReducible(Opc, ValLo) || bitOpWithConstantIsReducible(Opc, ValHi)) || (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + // We have 64-bit scalar and/or/xor, but do not have vector forms. + if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() && + !CRHS->user_begin()->isDivergent()) + return SDValue(); + // If we need to materialize a 64-bit immediate, it will be split up later // anyway. Avoid creating the harder to understand 64-bit immediate // materialization. @@ -13660,6 +13724,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_tanh: return true; default: break; @@ -14498,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. 
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14531,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14643,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15118,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). 
- // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. 
+ if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } + + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. 
+ // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. 
+ if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } @@ -16847,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); } +static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, + KnownBits &Known, const APInt &DemandedElts, + unsigned BFEWidth, bool SExt, unsigned Depth) { + const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo(); + const MachineOperand &Src1 = MI.getOperand(2); + + unsigned Src1Cst = 0; + if (Src1.isImm()) { + Src1Cst = Src1.getImm(); + } else if (Src1.isReg()) { + auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI); + if (!Cst) + return; + Src1Cst = Cst->Value.getZExtValue(); + } else { + return; + } + + // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit. + // Width is always [22:16]. + const unsigned Offset = + Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6); + const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6); + + if (Width >= BFEWidth) // Ill-formed. 
+ return; + + VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + + Known = Known.extractBits(Width, Offset); + + if (SExt) + Known = Known.sext(BFEWidth); + else + Known = Known.zext(BFEWidth); +} + void SITargetLowering::computeKnownBitsForTargetInstr( GISelValueTracking &VT, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth) const { + Known.resetAll(); const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { + case AMDGPU::S_BFE_I32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/false, Depth); + case AMDGPU::S_BFE_I64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/false, Depth); case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da..2af0a575a888 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. 
-class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 
1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. 
- return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<const MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - 
assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. 
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; -}; +class WaitcntBrackets; // This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -640,8 +407,13 @@ public: }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -657,8 +429,6 @@ private: bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -675,7 +445,7 @@ private: // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -686,6 +456,30 @@ public: (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -791,6 +585,211 @@ public: WaitcntBrackets &ScoreBrackets); }; +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. 
+// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + + bool isSmemCounter(InstCounterType T) const { + return T == Context->SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, 
const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + 
unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. + SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). 
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. 
unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool 
SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ca3af3b48a60..c8935f0cb603 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!SafeToPropagate) break; - DefOp.setIsKill(false); + for (auto I = Def; I != MI; ++I) + I->clearRegisterKills(DefOp.getReg(), &RI); } MachineInstrBuilder Builder = @@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { } } -static unsigned getAGPRSpillSaveOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_SAVE; - case 8: - return AMDGPU::SI_SPILL_A64_SAVE; - case 12: - return AMDGPU::SI_SPILL_A96_SAVE; - case 16: - return AMDGPU::SI_SPILL_A128_SAVE; - case 20: - return 
AMDGPU::SI_SPILL_A160_SAVE; - case 24: - return AMDGPU::SI_SPILL_A192_SAVE; - case 28: - return AMDGPU::SI_SPILL_A224_SAVE; - case 32: - return AMDGPU::SI_SPILL_A256_SAVE; - case 36: - return AMDGPU::SI_SPILL_A288_SAVE; - case 40: - return AMDGPU::SI_SPILL_A320_SAVE; - case 44: - return AMDGPU::SI_SPILL_A352_SAVE; - case 48: - return AMDGPU::SI_SPILL_A384_SAVE; - case 64: - return AMDGPU::SI_SPILL_A512_SAVE; - case 128: - return AMDGPU::SI_SPILL_A1024_SAVE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size, return AMDGPU::SI_SPILL_WWM_V32_SAVE; } -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillSaveOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); + return getVGPRSpillSaveOpcode(Size); } void SIInstrInfo::storeRegToStackSlot( @@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = + getVectorRegSpillSaveOpcode(VReg ? 
VReg : SrcReg, RC, SpillSize, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { } } -static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_RESTORE; - case 8: - return AMDGPU::SI_SPILL_A64_RESTORE; - case 12: - return AMDGPU::SI_SPILL_A96_RESTORE; - case 16: - return AMDGPU::SI_SPILL_A128_RESTORE; - case 20: - return AMDGPU::SI_SPILL_A160_RESTORE; - case 24: - return AMDGPU::SI_SPILL_A192_RESTORE; - case 28: - return AMDGPU::SI_SPILL_A224_RESTORE; - case 32: - return AMDGPU::SI_SPILL_A256_RESTORE; - case 36: - return AMDGPU::SI_SPILL_A288_RESTORE; - case 40: - return AMDGPU::SI_SPILL_A320_RESTORE; - case 44: - return AMDGPU::SI_SPILL_A352_RESTORE; - case 48: - return AMDGPU::SI_SPILL_A384_RESTORE; - case 64: - return AMDGPU::SI_SPILL_A512_RESTORE; - case 128: - return AMDGPU::SI_SPILL_A1024_RESTORE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: @@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, if (Size != 4) llvm_unreachable("unknown wwm register spill size"); - if (IsVectorSuperClass) + if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. 
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillRestoreOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); + assert(!RI.isAGPRClass(RC)); + return getVGPRSpillRestoreOpcode(Size); } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); + SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset @@ -2214,7 +2143,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (ST.hasMovB64()) { MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); if (SrcOp.isReg() || isInlineConstant(MI, 1) || - isUInt<32>(SrcOp.getImm())) + isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals()) break; } if (SrcOp.isImm()) { @@ -2273,6 +2202,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_MOV_B64_IMM_PSEUDO: { const MachineOperand &SrcOp = MI.getOperand(1); assert(!SrcOp.isFPImm()); + + if (ST.has64BitLiterals()) { + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + APInt Imm(64, SrcOp.getImm()); if (Imm.isIntN(32) || isInlineConstant(Imm)) { MI.setDesc(get(AMDGPU::S_MOV_B64)); @@ -2492,6 +2427,25 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::SI_PC_ADD_REL_OFFSET64: { + MachineFunction &MF = *MBB.getParent(); + Register Reg = MI.getOperand(0).getReg(); + MachineOperand Op = MI.getOperand(1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. 
+ MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + if (Op.isGlobal()) + Op.setOffset(Op.getOffset() + 4); + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op)); + + finalizeBundle(MBB, Bundler.begin()); + + MI.eraseFromParent(); + break; + } case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // Whole Wave Mode is entered. @@ -2807,12 +2761,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { if (!DefinedRC1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0); + return isLegalRegOperand(MI, OpIdx1, *MO0) && + (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); } if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { if (!DefinedRC0) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && + isLegalRegOperand(MI, OpIdx0, *MO1); } // No need to check 64-bit literals since swapping does not bring new @@ -2903,9 +2859,9 @@ bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, int64_t BrOffset) const { - // BranchRelaxation should never have to check s_setpc_b64 because its dest - // block is unanalyzable. - assert(BranchOp != AMDGPU::S_SETPC_B64); + // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64 + // because its dest block is unanalyzable. + assert(isSOPP(BranchOp) || isSOPK(BranchOp)); // Convert to dwords. 
BrOffset /= 4; @@ -2946,13 +2902,30 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + auto I = MBB.end(); + auto &MCCtx = MF->getContext(); + + if (ST.hasAddPC64Inst()) { + MCSymbol *Offset = + MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true); + auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64)) + .addSym(Offset, MO_FAR_BRANCH_OFFSET); + MCSymbol *PostAddPCLabel = + MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true); + AddPC->setPostInstrSymbol(*MF, PostAddPCLabel); + auto *OffsetExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), + MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx); + Offset->setVariableValue(OffsetExpr); + return; + } + + assert(RS && "RegScavenger required for long branching"); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - auto I = MBB.end(); - // Note: as this is used after hazard recognizer we need to apply some hazard // workarounds directly. const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) || @@ -2968,7 +2941,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); ApplyHazardWorkarounds(); - auto &MCCtx = MF->getContext(); MCSymbol *PostGetPCLabel = MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); @@ -3507,6 +3479,10 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { ? 
AMDGPU::V_FMAAK_F16_t16 : AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAAK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3535,6 +3511,10 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { ? AMDGPU::V_FMAMK_F16_t16 : AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAMK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3613,7 +3593,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64) { + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. 
if (hasAnyModifiersSet(UseMI)) @@ -3685,7 +3666,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3753,7 +3735,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -4074,8 +4057,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); - if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && - !IsLegacy && + if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy && + (!IsF64 || ST.hasFmaakFmamkF64Insts()) && // If we have an SGPR input, we will violate the constant bus restriction. 
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { @@ -6099,14 +6082,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; if (Is64BitOp && !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { - if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) && + (!ST.has64BitLiterals() || InstDesc.getSize() != 4)) return false; // FIXME: We can use sign extended 64-bit literals, but only for signed // operands. At the moment we do not know if an operand is signed. // Such operand will be encoded as its low 32 bits and then either // correctly sign extended or incorrectly zero extended by HW. - if (!Is64BitFPOp && (int32_t)Imm < 0) + // If 64-bit literals are supported and the literal will be encoded + // as full 64 bit we still can use it. + if (!Is64BitFPOp && (int32_t)Imm < 0 && + (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false))) return false; } } @@ -6402,7 +6389,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6426,7 +6413,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; @@ -6479,7 +6466,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. 
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence @@ -9178,15 +9165,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (isDPP(MI)) return DescSize; bool HasLiteral = false; + unsigned LiteralSize = 4; for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MCOperandInfo &OpInfo = Desc.operands()[I]; if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) { HasLiteral = true; + if (ST.has64BitLiterals()) { + switch (OpInfo.OperandType) { + default: + break; + case AMDGPU::OPERAND_REG_IMM_FP64: + if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true)) + LiteralSize = 8; + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false)) + LiteralSize = 8; + break; + } + } break; } } - return HasLiteral ? DescSize + 4 : DescSize; + return HasLiteral ? DescSize + LiteralSize : DescSize; } // Check whether we have extra NSA words. 
@@ -9277,13 +9279,16 @@ SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef<std::pair<unsigned, const char *>> SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { - { MO_GOTPCREL, "amdgpu-gotprel" }, - { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, - { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, - { MO_REL32_LO, "amdgpu-rel32-lo" }, - { MO_REL32_HI, "amdgpu-rel32-hi" }, - { MO_ABS32_LO, "amdgpu-abs32-lo" }, - { MO_ABS32_HI, "amdgpu-abs32-hi" }, + {MO_GOTPCREL, "amdgpu-gotprel"}, + {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"}, + {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"}, + {MO_GOTPCREL64, "amdgpu-gotprel64"}, + {MO_REL32_LO, "amdgpu-rel32-lo"}, + {MO_REL32_HI, "amdgpu-rel32-hi"}, + {MO_REL64, "amdgpu-rel64"}, + {MO_ABS32_LO, "amdgpu-abs32-lo"}, + {MO_ABS32_HI, "amdgpu-abs32-hi"}, + {MO_ABS64, "amdgpu-abs64"}, }; return ArrayRef(TargetFlags); @@ -10390,10 +10395,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9e84822bfc27..5e92921f3ea2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -33,6 +33,7 @@ class LiveVariables; class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; 
+class SIMachineFunctionInfo; class TargetRegisterClass; class ScheduleHazardRecognizer; @@ -214,16 +215,20 @@ public: MO_GOTPCREL32_LO = 2, // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI. MO_GOTPCREL32_HI = 3, + // MO_GOTPCREL64 -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. + MO_GOTPCREL64 = 4, // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO. - MO_REL32 = 4, - MO_REL32_LO = 4, + MO_REL32 = 5, + MO_REL32_LO = 5, // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. - MO_REL32_HI = 5, + MO_REL32_HI = 6, + MO_REL64 = 7, - MO_FAR_BRANCH_OFFSET = 6, + MO_FAR_BRANCH_OFFSET = 8, - MO_ABS32_LO = 8, - MO_ABS32_HI = 9, + MO_ABS32_LO = 9, + MO_ABS32_HI = 10, + MO_ABS64 = 11, }; explicit SIInstrInfo(const GCNSubtarget &ST); @@ -283,6 +288,15 @@ public: bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + unsigned + getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -863,6 +877,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } @@ -1097,7 +1113,6 @@ public: // that will not require an additional 4-bytes; this function assumes that it // will. 
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); if (!MO.isImm()) return false; return isInlineConstant(MO.getImm(), OperandType); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 5e41f875d980..9e1951e2946c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -268,6 +268,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIpc_add_rel_offset64 : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET64", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + def SIlds : SDNode<"AMDGPUISD::LDS", SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> >; @@ -1247,6 +1251,7 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey32bit : CustomOperand<i32, 1>; def IndexKey16bit : CustomOperand<i32, 1>; def IndexKey8bit : CustomOperand<i32, 1>; @@ -1302,6 +1307,9 @@ let PrintMethod = "printBitOp3" in def BitOp3 : NamedIntOperand<"bitop3">; def bitop3_0 : DefaultOperand<BitOp3, 0>; +def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; +def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; + class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; @@ -1633,6 +1641,8 @@ def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">; +def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern? 
+def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">; @@ -1641,6 +1651,7 @@ def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">; def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">; def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">; def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">; +def SWMMACIndex32 : ComplexPattern<untyped, 2, "SelectSWMMACIndex32">; def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; @@ -2654,6 +2665,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; + field bit HasMatrixReuse = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); @@ -2837,6 +2849,8 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>; def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; +def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; +def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4419ce00b473..991d9f83e92e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1144,6 +1144,14 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; +def SI_PC_ADD_REL_OFFSET64 : SPseudoInstSI < + (outs SReg_64:$dst), + (ins si_ga:$ptr), + [(set SReg_64:$dst, + (i64 (SIpc_add_rel_offset64 tglobaladdr:$ptr)))]> { + let 
SubtargetPredicate = Has64BitLiterals; +} + def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -2465,7 +2473,6 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { -let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern <V_ALIGNBIT_B32_e64>; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2475,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -} // isNotGFX9Plus - -let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), - (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in -def : GCNPat<pat, - (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), - 0, /* src1_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), - 0, /* src2_modifiers */ - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, - $src2, /* clamp */ 0, /* op_sel */ 0) ->; -} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3104,8 +3082,6 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; -// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped -// to V_PERM_B32. 
let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3451,30 +3427,32 @@ def : GCNPat < (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) >; - def : GCNPat < - (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), - (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) + (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; def : GCNPat < - (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), - (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) + (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; +} def : GCNPat < - (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), - (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; foreach vecTy = [v2i16, v2f16, v2bf16] in { @@ -3581,20 +3559,15 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in { -defvar BuildVectorToAlignBitPat = +let True16Predicate = NotHasTrue16BitInsts in +def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))); - -let SubtargetPredicate = isNotGFX9Plus in -def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>; - -let SubtargetPredicate = isGFX9GFX10 in -def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>; -} //True16Predicate = NotHasTrue16BitInsts + (Ty VGPR_32:$b))), + (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd95cd27..5097ac03954d 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast<const SITargetLowering *>(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 67ad28661da4..75ce67c00228 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -42,7 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { - const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI); + const GCNSubtarget &ST = *STI; FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); diff --git 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9173041a7bcc..fa2b8db6ba55 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -unsigned -SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const { +unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC, + bool IncludeCalls) const { for (MCPhysReg Reg : reverse(RC.getRegisters())) - if (MRI.isPhysRegUsed(Reg)) + if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 06a7a17b0246..0008e5f8cf3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,9 +486,11 @@ public: unsigned SubReg) const; // \returns a number of registers of a given \p RC used in a function. - // Does not go inside function calls. + // Does not go inside function calls. If \p IncludeCalls is true, it will + // include registers that may be clobbered by calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const; + const TargetRegisterClass &RC, + bool IncludeCalls = true) const; std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? 
AMDGPU::VirtRegFlag::WWM_REG diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index d24c301fc1e5..c194e5c255d4 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1294,6 +1294,7 @@ def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">; def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">; def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">; def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">; def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">; def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 1679cee32006..ef8faffa5f55 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; def Write16PassDGEMM : SchedWrite; +// WMMA/SWMMA instructions +def WriteXDL2PassWMMA : SchedWrite; +def WriteXDL4PassWMMA : SchedWrite; +def Write4PassWMMA : SchedWrite; +def Write8PassWMMA : SchedWrite; +def Write16PassWMMA : SchedWrite; + // Scalar float instructions def WriteSFPU : SchedWrite; @@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>; multiclass GFX125xCommonWriteRes { +let ReleaseAtCycles = [8] in +def : HWWriteRes<WriteXDL2PassWMMA, [HWXDL], 8>; +let ReleaseAtCycles = [16] in +def : HWWriteRes<WriteXDL4PassWMMA, [HWXDL], 16>; + +def : HWWriteRes<Write4PassWMMA, [HWVALU], 16>; +def : HWWriteRes<Write8PassWMMA, [HWVALU], 32>; +def : HWWriteRes<Write16PassWMMA, [HWVALU], 64>; + def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 7>; @@ -476,6 +492,11 @@ 
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; def : InstRW<[WriteCopy], (instrs COPY)>; + +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>; +def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>; +def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>; +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>; } // End GFX125xCommonWriteRes let SchedModel = GFX1250SpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index fd39b8a1350c..7a519117f248 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -463,6 +463,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAAK_F64; + break; } } @@ -497,6 +501,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAMK_F64; + break; } } @@ -961,7 +969,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || - MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 || + (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 && + ST->hasFmaakFmamkF64Insts())) { shrinkMadFma(MI); continue; } @@ -1058,7 +1068,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { // fold an immediate into the shrunk instruction as a literal operand. 
In // GFX10 VOP3 instructions can take a literal operand anyway, so there is // no advantage to doing this. - if (ST->hasVOP3Literal() && !IsPostRA) + // However, if 64-bit literals are allowed we still need to shrink it + // for such literal to be able to fold. + if (ST->hasVOP3Literal() && + (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) && + !IsPostRA) continue; if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2472b76fcf02..e103ccc2f00e 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -154,6 +154,10 @@ class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < let has_sdst = 0; } +class SOP1_1_REGIMM64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SSrc_b64:$src0), "$src0", pattern> { + let has_sdst = 0; +} class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < (ops node:$src0), @@ -317,6 +321,9 @@ let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; + +let SubtargetPredicate = HasAddPC64Inst in +def S_ADD_PC_I64 : SOP1_1_REGIMM64 <"s_add_pc_i64">; } // End isBranch = 1, isIndirectBranch = 1 let isReturn = 1 in { @@ -2130,6 +2137,9 @@ defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; +// GFX1250 +defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; + //===----------------------------------------------------------------------===// // SOP1 - GFX1150, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a32078cc403e..77258810dd68 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: @@ -639,6 +645,7 @@ bool isMAC(unsigned Opc) { Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F16_e64_vi || Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || + Opc == AMDGPU::V_FMAC_F64_e64_gfx12 || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F32_e64_gfx12 || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a3f454..c9d2c286bf23 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a 
binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 211112e5262a..f621f8581f77 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -526,6 +529,21 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>; defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in { +defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; +} + +let SubtargetPredicate = HasBF16TransInsts in { +defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +} } // End TRANS = 1, SchedRW = [WriteTrans32] defm 
V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16", @@ -785,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in { def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>; def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>; } + + defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>; + defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>; } // End SubtargetPredicate = isGFX1250Plus let SubtargetPredicate = isGFX10Plus in { @@ -1062,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>; } +multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + def _e64_gfx1250 : + VOP3_Real_Gen<ps, GFX1250Gen>, + VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; +} + defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">; @@ -1127,11 +1155,25 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; +defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; +defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; +defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; +defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; +defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm 
V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; +defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; +defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 25c6cbc3e1ab..030a6e1e978c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -175,10 +175,14 @@ multiclass VOP2Inst_e64<string opName, def _e64 : VOP3InstBase <opName, P, node, 1>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; - let SubtargetPredicate = isGFX11Plus in { - if P.HasExtVOP3DPP then - def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; - } // End SubtargetPredicate = isGFX11Plus + if P.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo <opName, P> { + let SubtargetPredicate = isGFX11Plus; + } + else if P.HasExt64BitDPP then + def _e64_dpp : VOP3_DPP_Pseudo <opName, P> { + let OtherPredicates = [HasDPALU_DPP]; + } } multiclass VOP2Inst_e64_VOPD<string opName, @@ -1492,7 +1496,9 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, VOP2_DPP<op, ps, opName, p, 1> { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = ps.SubtargetPredicate; - let OtherPredicates = ps.OtherPredicates; + let OtherPredicates = !listconcat(ps.OtherPredicates, + !if(p.HasExt64BitDPP, [HasDPALU_DPP], []), + 
!if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], [])); } class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget, @@ -1832,6 +1838,9 @@ let SubtargetPredicate = isGFX12Plus in { V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">; } // End SubtargetPredicate = isGFX12Plus +let SubtargetPredicate = HasFmacF64Inst in +defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>; + defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>; defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 75c531913ded..2e7f25b67fb6 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; - -// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. -// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. 
-defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; -defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; - let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>; let True16Predicate = UseFakeTrue16Insts in @@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0> } } -// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. -// The following is created to support that. 
-multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> { - defvar psName = opName#"_e64"; - def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI - VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> { - VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName); - let AsmString = AsmName # ps.AsmOperands; - } -} - } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; -let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; -} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 31997f803dfc..e51e9574f8de 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1223,6 +1223,8 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { Instruction Opcode2Addr = TwoAddr; 
Instruction Opcode3Addr = ThreeAddr; Predicate WaveSizePredicate; + Predicate SubtargetPredicate; + field bit is_wmma_xdl; } def WMMAOpcode : GenericEnum { @@ -1315,28 +1317,39 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; - bit IsFP8BF8 = _IsFP8BF8; - bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B + bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); int IndexType = _IndexType; + let HasMatrixReuse = _HasMatrixReuse; + bit HasIModOp = _Has_ImodOp; + let HasClamp = !and(IsIU, !not(HasIModOp)); let IsPacked = 1; let IsWMMA = !not(_IsSWMMAC); let IsSWMMAC = _IsSWMMAC; - bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); - bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret); + bit IsAB_F64 = !or(!eq(ArgTy[1], v2f64), !eq(ArgTy[1], v4f64)); + bit IsAB_F32 = !eq(ArgTy[1], v2f32); + bit IsAB_F16 = !or(!eq(ArgTy[1], v16f16), !eq(ArgTy[1], v8f16), !eq(ArgTy[1], v4f16)); + bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16), + !eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16)); + bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16); + + bit IsC_F64 = !eq(ArgTy[3], v8f64); bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); - bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16), + !eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16)); bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); - bit NegLo01 = !or(IsF16BF16, IsIU); - bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); - bit NegHi01 = IsF16BF16; - bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLo01 = !not(NoABMods); + bit NegLo2 = !and(!not(IsIU), 
!not(IsXF32), IsWMMA); + bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1] + bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); bit NegLoAny = !or(NegLo01, NegLo2); bit NegHiAny = !or(NegHi01, NegHi2); @@ -1345,19 +1358,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, let Src1RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[2].Size); let Src2RC64 = !if(IsSWMMAC, DstRC, !cast<RegisterOperand>("VISrc_"#ArgTy[3].Size# - !cond(IsC_F32: "_f32", - IsC_F16: "_f16", + !cond(IsC_F64: "_f64", + IsC_F32: "_f32", + IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); // For f16 and bf16 matrices A and B, each element can be modified by - // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but + // neg_hi[0:1] is ignored. For iu4 and iu8 matrices A and B neg_lo is // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) - // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each - // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16, f32 and f64 matrix C + // each element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). 
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index // --------------------------------------------------------------------------- + // wmma f64_f64 | neg_lo for neg A/B | neg_lo = 1 neg C(f64) + // | neg_hi ignored | neg_hi = 1 abs C(f64) + // --------------------------------------------------------------------------- + // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32) + // | neg_hi ignored | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f32_xf32 | not allowed for xf32 | not allowed + // --------------------------------------------------------------------------- // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) // --------------------------------------------------------------------------- @@ -1368,7 +1391,10 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | i32 matrices // --------------------------------------------------------------------------- // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) - // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f16) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f16) // --------------------------------------------------------------------------- // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst @@ -1380,103 +1406,153 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst // --------------------------------------------------------------------------- // swmmac f32_fp8/bf8 | not allowed for | not allowed for 
sparse matrix - // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + // swmmac f16_fp8/bf8 | f8 and bf8 matrices | A Index - matrix C is in dst + // --------------------------------------------------------------------------- // pseudo - // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. - - dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); - dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); - dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); dag IndexKey = !cond(!eq(IndexType, 0) : (ins), !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), - !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); - dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins)); + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), + !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); + + dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); + dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), - IsSWMMAC : 
!con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), - Clamp, Neg); + IsSWMMAC : !con((ins DstRC:$srcTiedDef), + !if(!eq(IndexType, 32), + (ins VRegSrc_64:$src2), + (ins VRegSrc_32:$src2)), + IndexKey)), + MatrixReuse, Clamp, Neg); // asm string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", !eq(IndexType, 8) : "$index_key_8bit", - !eq(IndexType, 16) : "$index_key_16bit"); - string ClampAsm = !if(IsIU, "$clamp", ""); + !eq(IndexType, 16) : "$index_key_16bit", + !eq(IndexType, 32) : "$index_key_32bit"); + string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); + string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns + bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); + bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); + bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); + bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16_IMod0 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag 
Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16_IMod0 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); + bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); + bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); + bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); + bit IsIUXF32 = !or(IsIU, IsXF32); + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), + IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16_IMod0 : (ins Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); - dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), - IsAB_BF16 : (ins Src0VT:$src0), - IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src0OutPat = 
!cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), - IsAB_BF16 : (ins (i32 8), Src0VT:$src0), - IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), - IsAB_BF16 : (ins Src1VT:$src1), - IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), - IsAB_BF16 : (ins (i32 8), Src1VT:$src1), - IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_BF16 : (ins Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_BF16 : (ins (i32 8), Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), - !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))), + !eq(IndexType, 32): (ins (i64 (SWMMACIndex32 i64:$src2, i32:$index_key_32bit)))); dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), - !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); - dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); - dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), + !eq(IndexType, 32): (ins i64:$src2, 
i32:$index_key_32bit)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat); - dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. 
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat); } -multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> { +def WMMAInstInfoTable : GenericTable { + let FilterClass = "WMMAInstInfo"; + let CppTypeName = "WMMAInstInfo"; + let Fields = ["Opcode", "is_wmma_xdl"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getWMMAInstInfoHelper"; +} + +class WMMAInstInfo { + Instruction Opcode = !cast<Instruction>(NAME); + bit is_wmma_xdl = 0; +} + +multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix, bit DiffVdstSrc2 = 0> { + + defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in - def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } - let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in - def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } @@ -1486,7 +1562,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse } multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, 
string PseudoInstrSuffix> { - def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let Mnemonic = Instr; let PseudoInstr = Instr#PseudoInstrSuffix; let mayRaiseFPException = 0; @@ -1556,6 +1632,76 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes +def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, 
v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>; + +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { +defm V_WMMA_F32_16X16X4_F32_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x4_f32", F32_F32_WMMA_w32, "_w32">; + +let is_wmma_xdl = 1 in { +defm V_WMMA_F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_bf16", F32_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x32_bf16", BF16_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16f32_16x16x32_bf16", BF16F32_BF16_WMMA_w32, "_w32", 1>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X64_IU8_w32 : 
WMMAInstGFX12<"v_wmma_i32_16x16x64_iu8", I32_IU8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_f16", F32_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x32_f16", F16_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_f32_32x16x128_f4", F32_32X16X128_F4_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x64_bf16", BF16_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm 
V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x128_iu8", I32_IU8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; + +} // End is_wmma_xdl = 1. 
+ +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; @@ -1628,7 +1774,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile let WaveSizePredicate = isWave64; } -let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; @@ -1655,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>; } -let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; @@ -1681,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>; } +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { + defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, 
F32_F32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", 
int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + + def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr, 
int_amdgcn_swmmac_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x128_iu8, I32_IU8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_f16, F32_F16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x64_f16, F16_F16X64_SWMMAC_w32>; +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 //===----------------------------------------------------------------------===// // Begin Real Encodings @@ -1726,13 +1915,14 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP> // opsel let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, - !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}, + !eq(WMMAP.IndexType, 32) : index_key_32bit{0}); let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); - let Inst{13} = 0; + let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0); // opsel_hi let Inst{59} = 1; let Inst{60} = 1; - let Inst{14} = 1; + let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1); // neg_lo let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); @@ -1742,7 +1932,7 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP> let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); // clamp - let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); + let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0); } multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP, @@ -1765,6 +1955,12 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> { } } +multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in 
{ + defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>; + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -1814,6 +2010,46 @@ defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; +defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x060, F32_F16X32_WMMA_w32>; +defm V_WMMA_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x061, F16_F16X32_WMMA_w32>; +defm V_WMMA_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x063, BF16_BF16X32_WMMA_w32>; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x064, BF16F32_BF16_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06a, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06b, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06c, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06d, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06e, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06f, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x070, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x071, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_I32_16X16X64_IU8_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x072, I32_IU8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x080, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x081, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x082, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x083, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x084, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x085, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; + +defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x068, BF16_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x069, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x073, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x074, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x075, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x076, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x077, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x078, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x079, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07a, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07b, I32_IU8X128_SWMMAC_w32>; + multiclass VOP3P_Real_with_name<GFXGen Gen, bits<8> op, string backing_ps_name = NAME, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index df215d23f7f4..a25ebdf3e5f6 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { // Special case for v_permlane16_swap_b32/v_permlane32_swap_b32 // op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands. -class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { +class VOP3OpSelIsDPP_base { bits<1> fi; bits<1> bound_ctrl; +} + +class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> { + // OPSEL[0] specifies FI + let Inst{11} = fi; + // OPSEL[1] specifies BOUND_CTRL + let Inst{12} = bound_ctrl; +} +class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> { // OPSEL[0] specifies FI let Inst{11} = fi; // OPSEL[1] specifies BOUND_CTRL @@ -432,7 +441,7 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } -class VOP3Pe <VOPProfile P> : Enc64 { +class VOP3Pe_Base { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -443,7 +452,12 @@ class VOP3Pe <VOPProfile P> : Enc64 { bits<1> clamp; bits<2> index_key_8bit; bits<1> index_key_16bit; + bits<1> index_key_32bit; + bits<1> matrix_a_reuse; + bits<1> matrix_b_reuse; +} +class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let 
Inst{7-0} = !if(P.HasDst, vdst, 0); let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 @@ -451,9 +465,13 @@ class VOP3Pe <VOPProfile P> : Enc64 { let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, + !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2) + let Inst{14} = !cond(!and(P.HasSrc2, P.HasOpSel) : src2_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixReuse : matrix_b_reuse, + 1: ?); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); |
