summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
authorPeter Collingbourne <peter@pcc.me.uk>2025-07-18 13:26:00 -0700
committerPeter Collingbourne <peter@pcc.me.uk>2025-07-18 13:26:00 -0700
commit9bf3524731070cadc6175707314f3b6ca37190d5 (patch)
tree86dcab7604336b01ae938fe81062c29ff69efba8 /llvm/lib/Target/AMDGPU
parent3a84c15cc13b6daf8e812592898ab6c7f19091a9 (diff)
parent4f43f0606c3d7e1ce6d069583b5e59f036e112ce (diff)
Created using spr 1.3.6-beta.1
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td33
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp294
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp119
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp90
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp108
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp234
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp19
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp42
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td504
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp266
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp508
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp258
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h29
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td14
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td73
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td1
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td21
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h9
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td19
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td29
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td392
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td26
63 files changed, 2258 insertions, 1353 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 23f106a9c1d4..007b481f8496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -153,6 +153,9 @@ private:
const TargetMachine &TM;
};
+void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
+extern char &AMDGPUPrepareAGPRAllocLegacyID;
+
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 31420caca089..0e0e83b7a6b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
"Use scratch_* flat memory instructions to access scratch"
>;
+def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
+ "FlatGVSMode",
+ "true",
+ "Have GVS addressing mode with flat_* instructions"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -541,6 +547,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
+ "HasBF16TransInsts",
+ "true",
+ "Has bf16 transcendental instructions"
+>;
+
def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"HasBF16ConversionInsts",
"true",
@@ -1106,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
+def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
+ "HasTanhInsts",
+ "true",
+ "Has v_tanh_f32/f16 instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1948,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureArchitectedSGPRs,
+ FeatureFlatGVSMode,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
FeatureAtomicDsPkAdd16Insts,
@@ -1966,7 +1985,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
+ FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
+ FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3PKF16,
@@ -2374,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
+def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">,
+ AssemblerPredicate<(all_of FeatureFlatGVSMode)>;
+
def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
@@ -2442,6 +2466,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
+
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
@@ -2657,6 +2684,9 @@ def HasDefaultComponentBroadcast
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
+def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
@@ -2680,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
+def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
+ AssemblerPredicate<(all_of FeatureTanhInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 79cf49f88d6d..dedee46a4423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,11 +13,9 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 22b921fb2084..5f1983791cfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads(
cl::ReallyHidden,
cl::init(false));
-static cl::opt<bool> Widen16BitOps(
- "amdgpu-codegenprepare-widen-16-bit-ops",
- cl::desc(
- "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
- cl::ReallyHidden, cl::init(false));
-
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
cl::desc("Break large PHI nodes for DAGISel"),
@@ -150,18 +144,6 @@ public:
bool canBreakPHINode(const PHINode &I);
- /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
- /// binary operation \p V.
- ///
- /// \returns Binary operation \p V.
- /// \returns \p T's base element bit width.
- unsigned getBaseElementBitWidth(const Type *T) const;
-
- /// \returns Equivalent 32 bit integer type for given type \p T. For example,
- /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
- /// is returned.
- Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
-
/// \returns True if binary operation \p I is a signed binary operation, false
/// otherwise.
bool isSigned(const BinaryOperator &I) const;
@@ -170,10 +152,6 @@ public:
/// signed 'icmp' operation, false otherwise.
bool isSigned(const SelectInst &I) const;
- /// \returns True if type \p T needs to be promoted to 32 bit integer type,
- /// false otherwise.
- bool needsPromotionToI32(const Type *T) const;
-
/// Return true if \p T is a legal scalar floating point type.
bool isLegalFloatingTy(const Type *T) const;
@@ -188,52 +166,6 @@ public:
computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
}
- /// Promotes uniform binary operation \p I to equivalent 32 bit binary
- /// operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
- /// truncating the result of 32 bit binary operation back to \p I's original
- /// type. Division operation is not promoted.
- ///
- /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
- /// false otherwise.
- bool promoteUniformOpToI32(BinaryOperator &I) const;
-
- /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
- ///
- /// \returns True.
- bool promoteUniformOpToI32(ICmpInst &I) const;
-
- /// Promotes uniform 'select' operation \p I to 32 bit 'select'
- /// operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
- /// result of 32 bit 'select' operation back to \p I's original type.
- ///
- /// \returns True.
- bool promoteUniformOpToI32(SelectInst &I) const;
-
- /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
- /// intrinsic.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by zero extending the operand to 32
- /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
- /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
- /// shift amount is 32 minus \p I's base element bit width), and truncating
- /// the result of the shift operation back to \p I's original type.
- ///
- /// \returns True.
- bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
-
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
/// the original will not change the value.
@@ -320,13 +252,11 @@ public:
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
bool visitLoadInst(LoadInst &I);
- bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
- bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
return MadeChange;
}
-unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
- assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
- if (T->isIntegerTy())
- return T->getIntegerBitWidth();
- return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
-}
-
-Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
- assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
- if (T->isIntegerTy())
- return B.getInt32Ty();
- return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
-}
-
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
return I.getOpcode() == Instruction::AShr ||
I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
@@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
cast<ICmpInst>(I.getOperand(0))->isSigned();
}
-bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
- if (!Widen16BitOps)
- return false;
-
- const IntegerType *IntTy = dyn_cast<IntegerType>(T);
- if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
- return true;
-
- if (const VectorType *VT = dyn_cast<VectorType>(T)) {
- // TODO: The set of packed operations is more limited, so may want to
- // promote some anyway.
- if (ST.hasVOP3PInsts())
- return false;
-
- return needsPromotionToI32(VT->getElementType());
- }
-
- return false;
-}
-
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
return Ty->isFloatTy() || Ty->isDoubleTy() ||
(Ty->isHalfTy() && ST.has16BitInsts());
}
-// Return true if the op promoted to i32 should have nsw set.
-static bool promotedOpIsNSW(const Instruction &I) {
- switch (I.getOpcode()) {
- case Instruction::Shl:
- case Instruction::Add:
- case Instruction::Sub:
- return true;
- case Instruction::Mul:
- return I.hasNoUnsignedWrap();
- default:
- return false;
- }
-}
-
-// Return true if the op promoted to i32 should have nuw set.
-static bool promotedOpIsNUW(const Instruction &I) {
- switch (I.getOpcode()) {
- case Instruction::Shl:
- case Instruction::Add:
- case Instruction::Mul:
- return true;
- case Instruction::Sub:
- return I.hasNoUnsignedWrap();
- default:
- return false;
- }
-}
-
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
int TySize = DL.getTypeSizeInBits(Ty);
@@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- if (I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SRem ||
- I.getOpcode() == Instruction::URem)
- return false;
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp0 = nullptr;
- Value *ExtOp1 = nullptr;
- Value *ExtRes = nullptr;
- Value *TruncRes = nullptr;
-
- if (isSigned(I)) {
- ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- } else {
- ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- }
-
- ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
- if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
- if (promotedOpIsNSW(cast<Instruction>(I)))
- Inst->setHasNoSignedWrap();
-
- if (promotedOpIsNUW(cast<Instruction>(I)))
- Inst->setHasNoUnsignedWrap();
-
- if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
- Inst->setIsExact(ExactOp->isExact());
- }
-
- TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
- assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
- Value *ExtOp0 = nullptr;
- Value *ExtOp1 = nullptr;
- Value *NewICmp = nullptr;
-
- if (I.isSigned()) {
- ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- } else {
- ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- }
- NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
-
- I.replaceAllUsesWith(NewICmp);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp1 = nullptr;
- Value *ExtOp2 = nullptr;
- Value *ExtRes = nullptr;
- Value *TruncRes = nullptr;
-
- if (isSigned(I)) {
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
- } else {
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
- }
- ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
- TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
- IntrinsicInst &I) const {
- assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
- "I must be bitreverse intrinsic");
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
- Value *ExtRes =
- Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
- Value *LShrOp =
- Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
- Value *TruncRes =
- Builder.CreateTrunc(LShrOp, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
return computeKnownBits(Op, DL, AC).countMaxActiveBits();
}
@@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (foldBinOpIntoSelect(I))
return true;
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
- UA.isUniform(&I) && promoteUniformOpToI32(I))
- return true;
-
if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
@@ -1770,16 +1504,6 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
return false;
}
-bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
- bool Changed = false;
-
- if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
- UA.isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
-
- return Changed;
-}
-
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Value *Cond = I.getCondition();
Value *TrueVal = I.getTrueValue();
@@ -1787,12 +1511,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Value *CmpVal;
CmpPredicate Pred;
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
- if (UA.isUniform(&I))
- return promoteUniformOpToI32(I);
- return false;
- }
-
// Match fract pattern with nan check.
if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
return false;
@@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
- case Intrinsic::bitreverse:
- return visitBitreverseIntrinsicInst(I);
case Intrinsic::minnum:
case Intrinsic::minimumnum:
case Intrinsic::minimum:
@@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
}
}
-bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
- bool Changed = false;
-
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
- UA.isUniform(&I))
- Changed |= promoteUniformBitreverseToI32(I);
-
- return Changed;
-}
-
/// Match non-nan fract pattern.
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc55..7b5d4077e85f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -55,6 +55,14 @@ def gi_vop3pmodsneg :
GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">,
GIComplexPatternEquiv<VOP3PModsNeg>;
+def gi_vop3pmodsnegs :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">,
+ GIComplexPatternEquiv<VOP3PModsNegs>;
+
+def gi_dotiuvop3pmodsnegabs :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">,
+ GIComplexPatternEquiv<VOP3PModsNegAbs>;
+
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
@@ -83,6 +91,10 @@ def gi_swmmacindex16 :
GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
GIComplexPatternEquiv<SWMMACIndex16>;
+def gi_swmmacindex32 :
+ GIComplexOperandMatcher<s64, "selectSWMMACIndex32">,
+ GIComplexPatternEquiv<SWMMACIndex32>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 202693b31612..25672a52345c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
return;
}
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
+ if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
+ CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
+ uint64_t C = 0;
+ bool AllConst = true;
+ unsigned EltSize = EltVT.getSizeInBits();
+ for (unsigned I = 0; I < NumVectorElts; ++I) {
+ SDValue Op = N->getOperand(I);
+ if (Op.isUndef()) {
+ AllConst = false;
+ break;
+ }
+ uint64_t Val;
+ if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
+ Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
+ } else
+ Val = cast<ConstantSDNode>(Op)->getZExtValue();
+ C |= Val << (EltSize * I);
+ }
+ if (AllConst) {
+ SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
+ MachineSDNode *Copy =
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
+ RegClass);
+ return;
+ }
+ }
+
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
// 32 = Max Num Vector Elements
@@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
- bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant:
case ISD::ConstantFP: {
- if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
+ Subtarget->has64BitLiterals())
break;
uint64_t Imm;
@@ -1632,8 +1661,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -3245,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
+// Select neg_lo from the i1 immediate operand.
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
@@ -3260,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
return true;
}
+// Select both neg_lo and neg_hi from the i1 immediate operand. This is
+// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
+// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+ // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+ // 1 promotes packed values to signed, 0 treats them as unsigned.
+ assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcSign = C->getZExtValue();
+ if (SrcSign == 1)
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+// Select neg, abs, or both neg and abs from the i16 immediate operans.
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcMod = C->getZExtValue();
+ switch (SrcMod) {
+ default: // Any other value will be silently ignored (considered as 0).
+ break;
+ case 1:
+ Mods ^= SISrcMods::NEG;
+ break;
+ case 2:
+ Mods ^= SISrcMods::ABS;
+ break;
+ case 3:
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ break;
+ }
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
@@ -3611,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
+ SDValue &IndexKey) const {
+ unsigned Key = 0;
+ Src = In;
+
+ SDValue InI32;
+
+ if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
+ const SDValue &ExtendSrc = In.getOperand(0);
+ if (ExtendSrc.getValueSizeInBits() == 32)
+ InI32 = ExtendSrc;
+ } else if (In->getOpcode() == ISD::BITCAST) {
+ const SDValue &CastSrc = In.getOperand(0);
+ if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
+ CastSrc.getOperand(0).getValueSizeInBits() == 32) {
+ ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
+ if (Zero && Zero->getZExtValue() == 0)
+ InI32 = CastSrc.getOperand(0);
+ }
+ }
+
+ if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
+ ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
+ if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
+ EltIdx->getZExtValue() == 1) {
+ Key = 1;
+ Src = ExtractVecEltSrc;
+ }
+ }
+
+ IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -3885,10 +3990,8 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
- const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- const SIInstrInfo * SII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *SII = Subtarget->getInstrInfo();
unsigned Limit = 0;
bool AllUsesAcceptSReg = true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f3b9364fdb92..9967f46e085e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -222,6 +222,8 @@ private:
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
+ bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const;
+ bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
@@ -233,6 +235,7 @@ private:
bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+ bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e64d2162441a..3d040fb705a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4006,7 +4006,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp: {
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
@@ -4842,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+// Detect when CMP and SELECT use the same constant and fold them to avoid
+// loading the constant twice. Specifically handles patterns like:
+// %cmp = icmp eq i32 %val, 4242
+// %sel = select i1 %cmp, i32 4242, i32 %other
+// It can be optimized to reuse %val instead of 4242 in select.
+static SDValue
+foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AMDGPUSubtarget *ST) {
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Check if constant should not be optimized - early return if not.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
+
+ // Only optimize normal floating-point values (finite, non-zero, and
+ // non-subnormal as per IEEE 754), skip optimization for inlinable
+ // floating-point constants.
+ if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
+
+ // Skip optimization for inlinable integer immediates.
+ // Inlinable immediates include: -16 to 64 (inclusive).
+ if (IntVal >= -16 && IntVal <= 64)
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
+ // Try to fold CMP + SELECT patterns with shared constants (both FP and
+ // integer).
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -5733,6 +5817,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(DUMMY_CHAIN)
NODE_NAME_CASE(LOAD_D16_HI)
@@ -6196,7 +6281,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp: {
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh: {
if (SNaN)
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0dd2183b72b2..4e8c6c7ea3b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -545,6 +545,7 @@ enum NodeType : unsigned {
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
PC_ADD_REL_OFFSET,
+ PC_ADD_REL_OFFSET64,
LDS,
DUMMY_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 44eaebffb70d..9a90787963d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -25,6 +25,7 @@ namespace {
class AMDGPUInsertDelayAlu {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
@@ -65,13 +66,16 @@ public:
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
- // Get the delay type for an instruction with the specified TSFlags.
- static DelayType getDelayType(uint64_t TSFlags) {
- if (TSFlags & SIInstrFlags::TRANS)
+ // Get the delay type for a MachineInstr.
+ DelayType getDelayType(const MachineInstr &MI) {
+ if (SIInstrInfo::isTRANS(MI))
return TRANS;
- if (TSFlags & SIInstrFlags::VALU)
+ // WMMA XDL ops are treated the same as TRANS.
+ if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI))
+ return TRANS;
+ if (SIInstrInfo::isVALU(MI))
return VALU;
- if (TSFlags & SIInstrFlags::SALU)
+ if (SIInstrInfo::isSALU(MI))
return SALU;
return OTHER;
}
@@ -368,7 +372,7 @@ public:
continue;
}
- DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ DelayType Type = getDelayType(MI);
if (instructionWaitsForSGPRWrites(MI)) {
auto It = State.find(LastSGPRFromVALU);
@@ -456,12 +460,12 @@ public:
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
<< "\n");
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasDelayAlu())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasDelayAlu())
return false;
- SII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ SII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
SchedModel = &SII->getSchedModel();
// Calculate the delay state for each basic block, iterating until we reach
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index b8996fb97f1c..e2c2e8912c71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
case Intrinsic::amdgcn_sqrt:
- case Intrinsic::amdgcn_rsq: {
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_tanh: {
Value *Src = II.getArgOperand(0);
if (isa<PoisonValue>(Src))
return IC.replaceInstUsesWith(II, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ea79c57080fa..1a63c48e3666 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
return Register();
}
+Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
+ Register AnyExtSrc;
+ if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
+ return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
+
+ // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
+ return Def->getOperand(1).getReg();
+
+ return Register();
+}
+
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (!Subtarget->hasVMemToLDSLoad())
return false;
@@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
return selectVOP3PRetHelper(Root, true);
}
+// Select neg_lo from the i1 immediate operand.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
@@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
}};
}
+// Select both neg_lo and neg_hi from the i1 immediate operand. This is
+// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
+// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const {
+ // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+ // Value is in Imm operand as i1 sign extended to int64_t.
+ // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() == -1)
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+// Select neg, abs, or both neg and abs from the i16 immediate operans.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const {
+
+ assert(Root.isImm() && "Modifier for C must be an immediate");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ switch (Root.getImm()) {
+ default: // Any other value will be silently ignored (considered as 0).
+ break;
+ case 1:
+ Mods ^= SISrcMods::NEG;
+ break;
+ case 2:
+ Mods ^= SISrcMods::ABS;
+ break;
+ case 3:
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ break;
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
MachineOperand &Root) const {
@@ -5150,6 +5214,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
+ Register Src =
+ getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+ unsigned Key = 0;
+
+ Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ if (!S32)
+ S32 = matchAnyExtendFromS32(Src);
+
+ if (S32) {
+ const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
+ if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
+ assert(Def->getNumOperands() == 3);
+ Register DstReg1 = Def->getOperand(1).getReg();
+ if (mi_match(S32, *MRI,
+ m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
+ Src = Def->getOperand(2).getReg();
+ Key = 1;
+ }
+ }
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 8e9e573147a8..2cb7904d27cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -201,6 +201,10 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3PModsNeg(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PModsNegs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PModsNegAbs(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
@@ -217,6 +221,8 @@ private:
selectSWMMACIndex8(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSWMMACIndex16(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
@@ -411,6 +417,9 @@ private:
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match an any extend from a 32-bit value to 64-bit.
+ Register matchAnyExtendFromS32(Register Reg) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index aa678df675fb..e7bf88d2ee5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2932,14 +2932,22 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
- MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
- .addDef(PCReg);
+ if (ST.has64BitLiterals()) {
+ assert(GAFlags != SIInstrInfo::MO_NONE);
- MIB.addGlobalAddress(GV, Offset, GAFlags);
- if (GAFlags == SIInstrInfo::MO_NONE)
- MIB.addImm(0);
- else
- MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
+ MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
+ } else {
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
+
+ MIB.addGlobalAddress(GV, Offset, GAFlags);
+ if (GAFlags == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
+ }
if (!B.getMRI()->getRegClassOrNull(PCReg))
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
@@ -2955,6 +2963,15 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
MachineRegisterInfo &MRI) const {
bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
+ if (RequiresHighHalf && ST.has64BitLiterals()) {
+ if (!MRI.getRegClassOrNull(DstReg))
+ MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B64)
+ .addDef(DstReg)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
+ return;
+ }
+
LLT S32 = LLT::scalar(32);
// Use the destination directly, if and only if we store the lower address
@@ -7622,6 +7639,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh8_intersect_ray:
return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
+ Register Index = MI.getOperand(5).getReg();
+ LLT S64 = LLT::scalar(64);
+ if (MRI.getType(Index) != S64)
+ MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
+ return true;
+ }
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
@@ -7636,15 +7667,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
- LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
- MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
+ ? LLT::scalar(64)
+ : LLT::scalar(32);
+ if (MRI.getType(Index) != IdxTy)
+ MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
return true;
}
+
case Intrinsic::amdgcn_fmed3: {
GISelChangeObserver &Observer = Helper.Observer;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2dec16de940d..c84a0f6e3138 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -50,6 +50,7 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) {
default:
return AMDGPUMCExpr::S_None;
case SIInstrInfo::MO_GOTPCREL:
+ case SIInstrInfo::MO_GOTPCREL64:
return AMDGPUMCExpr::S_GOTPCREL;
case SIInstrInfo::MO_GOTPCREL32_LO:
return AMDGPUMCExpr::S_GOTPCREL32_LO;
@@ -59,10 +60,14 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) {
return AMDGPUMCExpr::S_REL32_LO;
case SIInstrInfo::MO_REL32_HI:
return AMDGPUMCExpr::S_REL32_HI;
+ case SIInstrInfo::MO_REL64:
+ return AMDGPUMCExpr::S_REL64;
case SIInstrInfo::MO_ABS32_LO:
return AMDGPUMCExpr::S_ABS32_LO;
case SIInstrInfo::MO_ABS32_HI:
return AMDGPUMCExpr::S_ABS32_HI;
+ case SIInstrInfo::MO_ABS64:
+ return AMDGPUMCExpr::S_ABS64;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5d298304c27f..b6c6d927d0e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -114,7 +114,9 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse
MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
+MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
+MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
new file mode 100644
index 000000000000..3b06e9b00ac6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -0,0 +1,108 @@
+//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Make simple transformations to relax register constraints for cases which can
+// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into
+// AGPR or VGPR with a pseudo with an AV_* class register constraint. This
+// allows later passes to inflate the register class if necessary. The register
+// allocator does not know to replace instructions to relax constraints.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUPrepareAGPRAlloc.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc"
+
+namespace {
+
+class AMDGPUPrepareAGPRAllocImpl {
+private:
+ const SIInstrInfo &TII;
+ MachineRegisterInfo &MRI;
+
+public:
+ AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
+ : TII(*ST.getInstrInfo()), MRI(MRI) {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+
+char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
+
+char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID;
+
+bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+}
+
+PreservedAnalyses
+AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+ return PreservedAnalyses::all();
+}
+
+bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
+ if (MRI.isReserved(AMDGPU::AGPR0))
+ return false;
+
+ const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ TII.isInlineConstant(MI, 1)) ||
+ (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOperand(1).isImm())) {
+ MI.setDesc(AVImmPseudo);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
new file mode 100644
index 000000000000..dc598c98f241
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
@@ -0,0 +1,23 @@
+//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+class AMDGPUPrepareAGPRAllocPass
+ : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7a2a7fc250e2..f5e14c71b02d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -88,7 +88,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers(
// are %p and %s, which use to know if we
// are either storing a literal string or a
// pointer to the printf buffer.
- static const char ConvSpecifiers[] = "cdieEfgGaosuxXp";
+ static const char ConvSpecifiers[] = "cdieEfFgGaAosuxXp";
size_t CurFmtSpecifierIdx = 0;
size_t PrevFmtSpecifierIdx = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 6a59a28b1d32..411159c8aa33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -23,7 +23,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-regbanklegalize"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1483d97d23fc..bf2f37bddb9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh:
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fma_legacy:
case Intrinsic::amdgcn_frexp_mant:
@@ -4557,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_u16:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_sat_pk4_i4_i8:
+ case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
@@ -4688,6 +4691,44 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
+ case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
+ case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 46027b889023..8101c6898624 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+ Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
+ /*IncludeCalls=*/false);
+ if (ST.hasMAIInsts())
+ Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
+ /*IncludeCalls=*/false);
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
- Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
- Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
- if (ST.hasMAIInsts())
- Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+ Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
+ /*IncludeCalls=*/false);
return Info;
}
int32_t MaxVGPR = -1;
- int32_t MaxAGPR = -1;
- int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- // TODO: Check regmasks? Do they occur anywhere except calls?
- for (const MachineOperand &MO : MI.operands()) {
- unsigned Width = 0;
- bool IsSGPR = false;
- bool IsAGPR = false;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
switch (Reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- case AMDGPU::M0_LO16:
- case AMDGPU::M0_HI16:
- case AMDGPU::SRC_SHARED_BASE_LO:
- case AMDGPU::SRC_SHARED_BASE:
- case AMDGPU::SRC_SHARED_LIMIT_LO:
- case AMDGPU::SRC_SHARED_LIMIT:
- case AMDGPU::SRC_PRIVATE_BASE_LO:
- case AMDGPU::SRC_PRIVATE_BASE:
- case AMDGPU::SRC_PRIVATE_LIMIT_LO:
- case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- case AMDGPU::SGPR_NULL:
- case AMDGPU::SGPR_NULL64:
- case AMDGPU::MODE:
- continue;
-
case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- case AMDGPU::VCC_LO_LO16:
- case AMDGPU::VCC_LO_HI16:
- case AMDGPU::VCC_HI_LO16:
- case AMDGPU::VCC_HI_HI16:
- Info.UsesVCC = true;
- continue;
-
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- continue;
-
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
@@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
break;
}
- if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
- AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_16RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
- AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 1;
- } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 2;
- } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 3;
- } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 3;
- } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 3;
- } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 4;
- } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 5;
- } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 5;
- } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 5;
- } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 6;
- } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 6;
- } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 6;
- } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 7;
- } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 7;
- } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 7;
- } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 8;
- } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 9;
- } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 9;
- } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 9;
- } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 10;
- } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 10;
- } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 10;
- } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 11;
- } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 11;
- } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 11;
- } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 12;
- } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 12;
- } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 12;
- } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 16;
- } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 16;
- } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 32;
- } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 32;
- } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 32;
- } else {
- // We only expect TTMP registers or registers that do not belong to
- // any RC.
- assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
- AMDGPU::TTMP_64RegClass.contains(Reg) ||
- AMDGPU::TTMP_128RegClass.contains(Reg) ||
- AMDGPU::TTMP_256RegClass.contains(Reg) ||
- AMDGPU::TTMP_512RegClass.contains(Reg) ||
- !TRI.getPhysRegBaseClass(Reg)) &&
- "Unknown register class");
- }
+ const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+ assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
+ TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
+ AMDGPU::TTMP_64RegClass.contains(Reg) ||
+ AMDGPU::TTMP_128RegClass.contains(Reg) ||
+ AMDGPU::TTMP_256RegClass.contains(Reg) ||
+ AMDGPU::TTMP_512RegClass.contains(Reg)) &&
+ "Unknown register class");
+
+ if (!RC || !TRI.isVGPRClass(RC))
+ continue;
+
+ unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
- if (IsSGPR) {
- MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
- } else if (IsAGPR) {
- MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
- } else {
- MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
- }
+ MaxVGPR = std::max(MaxUsed, MaxVGPR);
}
if (MI.isCall()) {
@@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
}
}
- Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
- Info.NumAGPR = MaxAGPR + 1;
return Info;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 1f6002a3c6a2..dfe0cbf18c47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in
+def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 7c24f428d78e..1e44be8e4720 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ protected:
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
@@ -202,6 +203,8 @@ public:
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool hasBF16TransInsts() const { return HasBF16TransInsts; }
+
bool hasBF16ConversionInsts() const {
return HasBF16ConversionInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f4dc4a483181..c865082a1dce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -25,6 +25,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
+#include "AMDGPUPrepareAGPRAlloc.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
@@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGlobalISel(*PR);
initializeAMDGPUAsmPrinterPass(*PR);
initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
@@ -1196,6 +1198,7 @@ public:
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
+ void addPreRegAlloc() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
@@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() {
TargetPassConfig::addFastRegAlloc();
}
+void GCNPassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(&AMDGPUPrepareAGPRAllocLegacyID);
+}
+
void GCNPassConfig::addOptimizedRegAlloc() {
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
Base::addOptimizedRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(AMDGPUPrepareAGPRAllocPass());
+}
+
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
AddMachinePass &addPass) const {
// TODO: Check --regalloc-npm option
@@ -2284,6 +2297,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
Base::addPostRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const {
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(SIShrinkInstructionsPass());
+ addPass(SIPostRABundlerPass());
+}
+
void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
addPass(GCNCreateVOPDPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 3c62cd19c6e5..e0f1296ddded 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -181,8 +181,11 @@ public:
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
+ void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
+ void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
+ void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 6439230b8769..43d4e8db791b 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -157,6 +157,7 @@ public:
ImmTyNegHi,
ImmTyIndexKey8bit,
ImmTyIndexKey16bit,
+ ImmTyIndexKey32bit,
ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
@@ -174,8 +175,10 @@ public:
ImmTyWaitEXP,
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
- ImmTyByteSel,
ImmTyBitOp3,
+ ImmTyMatrixAReuse,
+ ImmTyMatrixBReuse,
+ ImmTyByteSel,
};
// Immediate operand kind.
@@ -419,6 +422,9 @@ public:
bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
+ bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
+ bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
+ bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppFI() const { return isImmTy(ImmTyDppFI); }
@@ -747,6 +753,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64);
}
+ bool isVISrc_512_f64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f64);
+ }
+
bool isVISrc_128B16() const {
return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16);
}
@@ -1116,6 +1126,7 @@ public:
case ImmTyCPol: OS << "CPol"; break;
case ImmTyIndexKey8bit: OS << "index_key"; break;
case ImmTyIndexKey16bit: OS << "index_key"; break;
+ case ImmTyIndexKey32bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -1162,8 +1173,10 @@ public:
case ImmTyWaitEXP: OS << "WaitEXP"; break;
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
- case ImmTyByteSel: OS << "ByteSel" ; break;
case ImmTyBitOp3: OS << "BitOp3"; break;
+ case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
+ case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
+ case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
}
@@ -1700,6 +1713,7 @@ public:
AMDGPUOperand::ImmTy ImmTy);
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
+ ParseStatus parseIndexKey32bit(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@@ -3981,8 +3995,8 @@ bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst,
bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3;
if (AsVOPD3) {
- for (unsigned I = 0, E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ for (const std::unique_ptr<MCParsedAsmOperand> &Operand : Operands) {
+ AMDGPUOperand &Op = (AMDGPUOperand &)*Operand;
if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) &&
(Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS))
Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions");
@@ -7153,7 +7167,9 @@ ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
if (!Res.isSuccess())
return Res;
- if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1))
+ if ((ImmTy == AMDGPUOperand::ImmTyIndexKey16bit ||
+ ImmTy == AMDGPUOperand::ImmTyIndexKey32bit) &&
+ (ImmVal < 0 || ImmVal > 1))
return Error(Loc, Twine("out of range ", StringRef(Pref)));
if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3))
@@ -7171,6 +7187,10 @@ ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit);
}
+ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
+ return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -9272,6 +9292,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
DefaultVal);
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAReuse, 0);
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_b_reuse))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBReuse, 0);
+
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
@@ -9378,6 +9406,10 @@ void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyIndexKey16bit);
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_32bit))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyIndexKey32bit);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClamp);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e3519f192137..42edec0d0149 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUPrepareAGPRAlloc.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 3625db9a4791..c8a4e22ed1da 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
+// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode.
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -237,10 +238,18 @@ class FLAT_Load_Pseudo<
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Load_Pseudo_t16<string opName> {
- def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
+multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>;
let True16Predicate = UseRealTrue16Insts in
- def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+ defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
@@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
-multiclass FLAT_Store_Pseudo_t16<string opName> {
- def "" : FLAT_Store_Pseudo<opName, VGPR_32>;
- let OtherPredicates = [HasTrue16BitInsts] in
- def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
+ def "" : FLAT_Store_Pseudo<opName, regClass>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>;
+
+ defvar Name16 = opName#"_t16";
+ let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in {
+ def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+ GlobalSaddrTable<0, Name16>,
+ True16D16Table<NAME#"_D16_HI", NAME>;
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ GlobalSaddrTable<1, Name16>,
+ True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
+ }
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
@@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+ (outs),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo_RTN<
@@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+ RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+ (outs vdst_op:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op:$vdst),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName#"_rtn"> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let PseudoInstr = NAME#"_SADDR_RTN";
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo<
@@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo<
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>;
+defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
-def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
-def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">;
}
-def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
-def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
}
-defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">;
-defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">;
+defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">;
+defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
VGPR_32, i32, v2i32, VReg_64>;
@@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
(inst $saddr, $voffset, $offset, 0, $in)
>;
+class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
+ (inst $saddr, $voffset, $offset, (i32 0), $in)
+>;
+
+class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
+ (inst $saddr, $voffset, $offset, (i32 0))
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, (i32 0))
@@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
-class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, 0)
>;
-class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt> : GCNPat <
+class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
@@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
}
+multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat <inst, node, vt>;
+
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <inst, node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
- def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
-def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
foreach vt = Reg32Types.types in {
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>;
}
foreach vt = VReg_64.RegTypes in {
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>;
foreach vt = VReg_128.RegTypes in {
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
-def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
} // end foreach as
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+
let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
@@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in {
}
let OtherPredicates = [HasD16LoadStore] in {
-def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
// appropriate waits.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
VFLAT_Aliases_gfx12<name, alias>,
VFLAT_Real_gfx12<op, name>;
-multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
- string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = name> :
- VFLAT_Real_Base_gfx12<op, name, alias> {
- defm _RTN : VFLAT_Real_gfx12<op, name>;
-}
-
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
VFLAT_Real_Base_gfx12<op, name, alias> {
@@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
}
}
-multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Aliases_gfx12<name> {
let DecoderNamespace = "GFX12W64" in {
@@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
}
}
-multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
- VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
+ VFLAT_Real_AllAddr_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
}
@@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op,
}
// ENC_VFLAT.
-defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">;
-defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">;
-defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">;
-defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">;
-defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">;
-defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">;
-defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">;
-defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">;
-defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">;
-defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">;
-defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">;
-defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">;
-defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">;
-defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">;
-defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">;
-defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">;
-defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">;
-defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">;
-defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">;
-defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">;
-defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">;
-defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">;
+defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">;
+defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">;
+defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">;
+defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">;
+defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">;
+defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">;
+defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">;
+defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">;
+defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">;
+defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">;
+defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">;
+defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">;
+defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">;
+defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">;
+defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">;
+defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">;
+defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">;
+defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">;
+defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">;
+defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">;
defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">;
defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">;
defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">;
@@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
// ENC_VGLOBAL.
-defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">;
-defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">;
-defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">;
-defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">;
-defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">;
-defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">;
-defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">;
-defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">;
-defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">;
-defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">;
-defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
-defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
-defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
-defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
-defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
-defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
-defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
-defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
-defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
-defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
-defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
-defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
-defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
-defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
-defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>;
-defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>;
-
-defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
-defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
-defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
-defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
-defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
-defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
-defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
-defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
-defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
-defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
-defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
-defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
-defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
-defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
-defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
-defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
-defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
-defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
-defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
-defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
-defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
-defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
-defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
-defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
-defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
-defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
-defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
-defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>;
-defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
-defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
-defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
+defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">;
+defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">;
+defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">;
+defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">;
+defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">;
+defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">;
+defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">;
+defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">;
+defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">;
+defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">;
+defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
+defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
+defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
+defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
+defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
+defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
+defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
+defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
+defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>;
+defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>;
+
+defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
+defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
+defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
+defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
+defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
+defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
+defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
+defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
+defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
+defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
+defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
+defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
+defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
+defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
+defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
+defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
+defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
+defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
+defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
+defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
+defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
+defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
+defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
+defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
+defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
+defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>;
+defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>;
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>;
-defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>;
-defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>;
+defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>;
+defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>;
-defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>;
-defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>;
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
+defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>;
defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0976fccf78d8..bbed828b4fed 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
}
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
+ fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
@@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
return true;
}
+bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
+ !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
+ if (!SIInstrInfo::isTRANS(I))
+ return false;
+
+ // RAW: Trans(I) writes, VALU(MI) reads.
+ Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+
+ // WAR: Trans(I) reads, VALU(MI) writes.
+ Register ValuDef = ValuDst->getReg();
+ for (const MachineOperand &TransUse : I.explicit_uses()) {
+ if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsExpiredFn = [](const MachineInstr &I, int) {
+ return SIInstrInfo::isVALU(I);
+ };
+
+ const int HasVALU = std::numeric_limits<int>::max();
+ if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ return true;
+}
+
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bbc55851bf96..ef6ddd874f58 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -104,6 +104,7 @@ private:
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
bool fixVALUTransUseHazard(MachineInstr *MI);
+ bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fce8f36d4596..a6553083d722 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() {
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
- RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
+ &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e6dd98a10420..268162bcada4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -214,6 +214,7 @@ protected:
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
+ bool FlatGVSMode = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
@@ -233,6 +234,7 @@ protected:
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
+ bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1156,10 +1158,12 @@ public:
bool hasMadF16() const;
- bool hasMovB64() const { return GFX940Insts; }
+ bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ bool hasFlatGVSMode() const { return FlatGVSMode; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -1377,6 +1381,10 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasTanhInsts() const { return HasTanhInsts; }
+
+ bool hasAddPC64Inst() const { return GFX1250Insts; }
+
bool hasMinimum3Maximum3PKF16() const {
return HasMinimum3Maximum3PKF16;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index e7d0e1838fa6..2a920f6feb1c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -108,7 +108,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext *Ctx) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case AMDGPU::fixup_si_sopp_br: {
int64_t BrImm = (SignedValue - 4) / 4;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 22ae5f4e7191..0d5a8be6220d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -64,6 +64,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup,
return ELF::R_AMDGPU_ABS32_LO;
case AMDGPUMCExpr::S_ABS32_HI:
return ELF::R_AMDGPU_ABS32_HI;
+ case AMDGPUMCExpr::S_ABS64:
+ return ELF::R_AMDGPU_ABS64;
}
MCFixupKind Kind = Fixup.getKind();
@@ -76,7 +78,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup,
return IsPCRel ? ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64;
}
- if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) {
+ if (Fixup.getKind() == AMDGPU::fixup_si_sopp_br) {
const auto *SymA = Target.getAddSym();
assert(SymA);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index cb6319ed627c..ec9248b972ec 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1332,6 +1332,16 @@ void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
O << " index_key:" << Imm;
}
+void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " index_key:" << Imm;
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index fb803b1f8134..e3299a618e88 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -132,6 +132,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 31dd373e54fb..ffdac8b8ce32 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -25,6 +25,7 @@ const MCAsmInfo::AtSpecifier atSpecifiers[] = {
{AMDGPUMCExpr::S_REL64, "rel64"},
{AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"},
{AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"},
+ {AMDGPUMCExpr::S_ABS64, "abs64"},
};
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 4bb3942936f0..f48739fe0181 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -381,9 +381,11 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
// Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
// Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
- if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
- Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
- Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
+ if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
+ // Matrix B reuse operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
}
@@ -562,7 +564,8 @@ static bool needsPCRel(const MCExpr *Expr) {
case MCExpr::SymbolRef: {
auto *SE = cast<MCSymbolRefExpr>(Expr);
auto Spec = AMDGPU::getSpecifier(SE);
- return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI;
+ return Spec != AMDGPUMCExpr::S_ABS32_LO &&
+ Spec != AMDGPUMCExpr::S_ABS32_HI && Spec != AMDGPUMCExpr::S_ABS64;
}
case MCExpr::Binary: {
auto *BE = cast<MCBinaryExpr>(Expr);
@@ -685,7 +688,12 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
uint32_t Offset = Desc.getSize();
assert(Offset == 4 || Offset == 8);
- addFixup(Fixups, Offset, MO.getExpr(), FK_Data_4, PCRel);
+ auto OpType = Desc.operands()[OpNo].OperandType;
+ MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+ OpType == AMDGPU::OPERAND_REG_IMM_INT64)
+ ? FK_Data_8
+ : FK_Data_4;
+ addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel);
}
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index e1b9720cdbfc..bc6fdf7f2e4c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -50,6 +50,7 @@ public:
S_REL64, // symbol@rel64
S_ABS32_LO, // symbol@abs32@lo
S_ABS32_HI, // symbol@abs32@hi
+ S_ABS64, // symbol@abs64
};
private:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9b5a46395695..f018f77bc83e 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
default:
return false;
case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// Copies and REG_SEQUENCE do not contribute to the final assembly
// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
- if (Inst->isCopy() || Inst->isRegSequence()) {
- if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
- if (!Inst->isCopy() ||
- !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
- Info.NumSVCopies++;
- continue;
- }
+ if (Inst->isRegSequence() &&
+ TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+ Info.NumSVCopies++;
+ continue;
+ }
+ if (Inst->isCopy()) {
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
+ if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
+ !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+ Info.NumSVCopies++;
+ continue;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ed06c37507a..e172c0b63189 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
+ SetVector<MachineInstr *> ConstantFoldCandidates;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
- if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
- Changed = true;
- }
+ if (Fold.isImm())
+ ConstantFoldCandidates.insert(Fold.UseMI);
} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
}
}
+
+ for (MachineInstr *MI : ConstantFoldCandidates) {
+ if (tryConstantFoldOp(MI)) {
+ LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
+ Changed = true;
+ }
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e2a10be4c2c7..0c76ff2ec5ea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "si-lower"
@@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16TransInsts()) {
+ setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
+ }
+
if (Subtarget->hasCvtPkF16F32Inst()) {
setOperationAction(ISD::FP_ROUND,
{MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
@@ -3893,7 +3899,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
- auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ auto *TRI = Subtarget->getRegisterInfo();
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
+ if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
+ assert(GAFlags != SIInstrInfo::MO_NONE);
+
+ SDValue Ptr =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
+ }
+
SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
SDValue PtrHi;
if (GAFlags == SIInstrInfo::MO_NONE)
@@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
}
if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
+ if (Subtarget->has64BitLiterals()) {
+ SDValue Addr = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
+ 0);
+ }
+
SDValue AddrLo = DAG.getTargetGlobalAddress(
GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
@@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
- Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ Module *M = MF.getFunction().getParent();
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
auto *RelocSymbol = cast<GlobalVariable>(
@@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3), IndexKeyi32);
}
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
+ if (Op.getOperand(4).getValueType() == MVT::i64)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
+ Op.getOperand(6)});
+ }
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
+ EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
+ ? MVT::i64
+ : MVT::i32;
+ if (Op.getOperand(6).getValueType() == IndexKeyTy)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+ IndexKey, Op.getOperand(7),
+ Op.getOperand(8)}); // No clamp operand
+ }
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
@@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
- SDValue Cond = Op.getOperand(0);
+ SDValue Cond = DAG.getFreeze(Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
bitOpWithConstantIsReducible(Opc, ValHi)) ||
(CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+ // We have 64-bit scalar and/or/xor, but do not have vector forms.
+ if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
+ !CRHS->user_begin()->isDivergent())
+ return SDValue();
+
// If we need to materialize a 64-bit immediate, it will be split up later
// anyway. Avoid creating the harder to understand 64-bit immediate
// materialization.
@@ -13660,6 +13724,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_tanh:
return true;
default:
break;
@@ -14498,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
- assert(N->getOpcode() == ISD::ADD);
+ assert(N->isAnyAdd());
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14531,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
- if (User->getOpcode() != ISD::ADD)
+ if (!User->isAnyAdd())
return SDValue();
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14643,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::PTRADD)
+ Opcode = ISD::ADD;
SDValue AddHi =
- DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+ DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -15118,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::ADD) {
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
- // y is not, and (add y, z) is used only once.
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
- // z is not, and (add y, z) is used only once.
- // The goal is to move constant offsets to the outermost ptradd, to create
- // more opportunities to fold offsets into memory instructions.
- // Together with the generic combines in DAGCombiner.cpp, this also
- // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
- //
- // This transform is here instead of in the general DAGCombiner as it can
- // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
- // AArch64's CPA.
- SDValue X = N0;
- SDValue Y = N1.getOperand(0);
- SDValue Z = N1.getOperand(1);
- if (N1.hasOneUse()) {
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if (ZIsConstant != YIsConstant) {
- // If both additions in the original were NUW, the new ones are as well.
- SDNodeFlags Flags =
- (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
- if (YIsConstant)
- std::swap(Y, Z);
+ // The following folds transform PTRADDs into regular arithmetic in cases
+ // where the PTRADD wouldn't be folded as an immediate offset into memory
+ // instructions anyway. They are target-specific in that other targets might
+ // prefer to not lose information about the pointer arithmetic.
+
+ // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+ // Adapted from DAGCombiner::visitADDLikeCommutative.
+ SDValue V, K;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+ SDNodeFlags ShlFlags = N1->getFlags();
+ // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
+ // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
+ // preserved.
+ SDNodeFlags NewShlFlags =
+ ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
+ ? SDNodeFlags::NoSignedWrap
+ : SDNodeFlags();
+ SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+ }
+
+ // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+ // performAddCombine.
+ if (N1.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
+ }
+ }
+
+ // If the 32 low bits of the constant are all zero, there is nothing to fold
+ // into an immediate offset, so it's better to eliminate the unnecessary
+ // addition for the lower 32 bits than to preserve the PTRADD.
+ // Analogous to a fold in performAddCombine.
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+ // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+ // global address GA and constant c, such that c can be folded into GA.
+ SDValue GAValue = N0.getOperand(0);
+ if (const GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(GAValue)) {
+ if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
+ // If both additions in the original were NUW, reassociation preserves
+ // that.
+ SDNodeFlags Flags =
+ (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+ SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+ return SDValue();
+
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // If both additions in the original were NUW, reassociation preserves that.
+ SDNodeFlags ReassocFlags =
+ (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+ if (ZIsConstant != YIsConstant) {
+ if (YIsConstant)
+ std::swap(Y, Z);
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+ }
+
+ // If one of Y and Z is constant, they have been handled above. If both were
+ // constant, the addition would have been folded in SelectionDAG::getNode
+ // already. This ensures that the generic DAG combines won't undo the
+ // following reassociation.
+ assert(!YIsConstant && !ZIsConstant);
+
+ if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+ // y are uniform and z isn't.
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+ // z are uniform and y isn't.
+ // The goal is to push uniform operands up in the computation, so that they
+ // can be handled with scalar operations. We can't use reassociateScalarOps
+ // for this since it requires two identical commutative operations to
+ // reassociate.
+ if (Y->isDivergent())
+ std::swap(Y, Z);
+ SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(UniformInner.getNode());
+ return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+ }
+
return SDValue();
}
@@ -16847,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST,
Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
+static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
+ KnownBits &Known, const APInt &DemandedElts,
+ unsigned BFEWidth, bool SExt, unsigned Depth) {
+ const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
+ const MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned Src1Cst = 0;
+ if (Src1.isImm()) {
+ Src1Cst = Src1.getImm();
+ } else if (Src1.isReg()) {
+ auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
+ if (!Cst)
+ return;
+ Src1Cst = Cst->Value.getZExtValue();
+ } else {
+ return;
+ }
+
+ // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
+ // Width is always [22:16].
+ const unsigned Offset =
+ Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
+ const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
+
+ if (Width >= BFEWidth) // Ill-formed.
+ return;
+
+ VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+
+ Known = Known.extractBits(Width, Offset);
+
+ if (SExt)
+ Known = Known.sext(BFEWidth);
+ else
+ Known = Known.zext(BFEWidth);
+}
+
void SITargetLowering::computeKnownBitsForTargetInstr(
GISelValueTracking &VT, Register R, KnownBits &Known,
const APInt &DemandedElts, const MachineRegisterInfo &MRI,
unsigned Depth) const {
+ Known.resetAll();
const MachineInstr *MI = MRI.getVRegDef(R);
switch (MI->getOpcode()) {
+ case AMDGPU::S_BFE_I32:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
+ /*SExt=*/true, Depth);
+ case AMDGPU::S_BFE_U32:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
+ /*SExt=*/false, Depth);
+ case AMDGPU::S_BFE_I64:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
+ /*SExt=*/true, Depth);
+ case AMDGPU::S_BFE_U64:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
+ /*SExt=*/false, Depth);
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ce1359f03da..2af0a575a888 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
llvm_unreachable("event type has no associated counter");
}
-// This objects maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
-public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
-
- bool isSmemCounter(InstCounterType T) const {
- return T == SmemAccessCounter || T == X_CNT;
- }
-
- unsigned getSgprScoresIdx(InstCounterType T) const {
- assert(isSmemCounter(T) && "Invalid SMEM counter");
- return T == X_CNT ? 1 : 0;
- }
-
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreLBs[T];
- }
-
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreUBs[T];
- }
-
- unsigned getScoreRange(InstCounterType T) const {
- return getScoreUB(T) - getScoreLB(T);
- }
-
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS)
- return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
- }
-
- bool merge(const WaitcntBrackets &Other);
-
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- const MachineOperand &Op) const;
-
- bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-
- void determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const;
- void determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- determineWait(T, {RegNo, RegNo + 1}, Wait);
- }
-
- void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
-
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
- }
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
- return HasPending;
- }
-
- bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
- // Return true if more than one bit is set in Events.
- return Events & (Events - 1);
- }
-
- bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
- }
-
- void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
- }
-
- bool hasPendingGDS() const {
- return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
- }
-
- unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
- }
-
- void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
-
- // Return true if there might be pending writes to the vgpr-interval by VMEM
- // instructions with types different from V.
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- if (VgprVmemTypes[RegNo] & ~(1 << V))
- return true;
- }
- return false;
- }
-
- void clearVgprVmemTypes(RegInterval Interval) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- VgprVmemTypes[RegNo] = 0;
- }
- }
-
- void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
- }
-
- ArrayRef<const MachineInstr *> getLDSDMAStores() const {
- return LDSDMAStores;
- }
-
- bool hasPointSampleAccel(const MachineInstr &MI) const;
- bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
- RegInterval Interval) const;
-
- void print(raw_ostream &) const;
- void dump() const { print(dbgs()); }
-
-private:
- struct MergeInfo {
- unsigned OldLB;
- unsigned OtherLB;
- unsigned MyShift;
- unsigned OtherShift;
- };
- static bool mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore);
-
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreUBs[T] = Val;
-
- if (T != EXP_CNT)
- return;
-
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
- }
-
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- setScoreByInterval({GprNo, GprNo + 1}, T, Val);
- }
-
- void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
- unsigned Score);
-
- void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
-
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
- // Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
- // Remember the last GDS operation.
- unsigned LastGDS = 0;
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
- // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
- // X_CNT score.
- unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
- // Store representative LDS DMA operations. The only useful info here is
- // alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
-};
+class WaitcntBrackets;
// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
@@ -640,8 +407,13 @@ public:
};
class SIInsertWaitcnts {
+public:
+ const GCNSubtarget *ST;
+ InstCounterType SmemAccessCounter;
+ InstCounterType MaxCounter;
+ const unsigned *WaitEventMaskForInst;
+
private:
- const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -657,8 +429,6 @@ private:
bool Dirty = true;
};
- InstCounterType SmemAccessCounter;
-
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -675,7 +445,7 @@ private:
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -686,6 +456,30 @@ public:
(void)ForceVMCounter;
}
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
bool isPreheaderToFlush(MachineBasicBlock &MBB,
const WaitcntBrackets &ScoreBrackets);
@@ -791,6 +585,211 @@ public:
WaitcntBrackets &ScoreBrackets);
};
+// This objects maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of event happen in the bracket,
+// wait count may get decreased out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+ WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
+
+ bool isSmemCounter(InstCounterType T) const {
+ return T == Context->SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ assert(isSmemCounter(T) && "Invalid SMEM counter");
+ return T == X_CNT ? 1 : 0;
+ }
+
+ unsigned getScoreLB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreLBs[T];
+ }
+
+ unsigned getScoreUB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreUBs[T];
+ }
+
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
+ if (GprNo < NUM_ALL_VGPRS)
+ return VgprScores[T][GprNo];
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+ }
+
+ bool merge(const WaitcntBrackets &Other);
+
+ RegInterval getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ const MachineOperand &Op) const;
+
+ bool counterOutOfOrder(InstCounterType T) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+
+ void determineWait(InstCounterType T, RegInterval Interval,
+ AMDGPU::Waitcnt &Wait) const;
+ void determineWait(InstCounterType T, int RegNo,
+ AMDGPU::Waitcnt &Wait) const {
+ determineWait(T, {RegNo, RegNo + 1}, Wait);
+ }
+
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
+ }
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
+
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = hasPendingEvent(T);
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ }
+
+ bool hasPendingGDS() const {
+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+ }
+
+ unsigned getPendingGDSWait() const {
+ return std::min(getScoreUB(DS_CNT) - LastGDS,
+ Context->getWaitCountMax(DS_CNT) - 1);
+ }
+
+ void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
+ // Return true if there might be pending writes to the vgpr-interval by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ if (VgprVmemTypes[RegNo] & ~(1 << V))
+ return true;
+ }
+ return false;
+ }
+
+ void clearVgprVmemTypes(RegInterval Interval) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[RegNo] = 0;
+ }
+ }
+
+ void setStateOnFunctionEntryOrReturn() {
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
+ PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
+ }
+
+ bool hasPointSampleAccel(const MachineInstr &MI) const;
+ bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+ RegInterval Interval) const;
+
+ void print(raw_ostream &) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ struct MergeInfo {
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
+ };
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
+
+ void setScoreLB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreUBs[T] = Val;
+
+ if (T != EXP_CNT)
+ return;
+
+ if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+ setScoreByInterval({GprNo, GprNo + 1}, T, Val);
+ }
+
+ void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
+ unsigned Score);
+
+ void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ const MachineOperand &Op, InstCounterType CntTy,
+ unsigned Val);
+
+ const SIInsertWaitcnts *Context;
+
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
+ // Remember the last flat memory operation.
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
+ // Remember the last GDS operation.
+ unsigned LastGDS = 0;
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+ // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+ // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+ // X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
+};
+
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
static char ID;
@@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
+ MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
assert(isUInt<8>(RegIdx));
@@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
- if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+ if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
return false;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+ InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
void WaitcntBrackets::print(raw_ostream &OS) const {
+ const GCNSubtarget *ST = Context->ST;
+
OS << '\n';
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
@@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// s_waitcnt instruction.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !ST->hasFlatLgkmVMemCountInOrder()) {
+ !Context->ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
@@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
unsigned NeededWait =
- std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~WaitEventMaskForInst[T];
+ PendingEvents &= ~Context->WaitEventMaskForInst[T];
}
}
@@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
@@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
VgprUB = std::max(VgprUB, Other.VgprUB);
SgprUB = std::max(SgprUB, Other.SgprUB);
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
+ const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+ WaitEventMaskForInst = WCG->getWaitEventMask();
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- HardwareLimits Limits = {};
if (ST->hasExtendedWaitCounts()) {
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
@@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets) {
- Brackets = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ Brackets = std::make_unique<WaitcntBrackets>(this);
} else {
// Reinitialize in-place. N.B. do not do this by assigning from a
// temporary because the WaitcntBrackets class is large and it could
// cause this function to use an unreasonable amount of stack space.
Brackets->~WaitcntBrackets();
- new (Brackets.get()) WaitcntBrackets(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ new (Brackets.get()) WaitcntBrackets(this);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ca3af3b48a60..c8935f0cb603 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
if (!SafeToPropagate)
break;
- DefOp.setIsKill(false);
+ for (auto I = Def; I != MI; ++I)
+ I->clearRegisterKills(DefOp.getReg(), &RI);
}
MachineInstrBuilder Builder =
@@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_SAVE;
- case 8:
- return AMDGPU::SI_SPILL_A64_SAVE;
- case 12:
- return AMDGPU::SI_SPILL_A96_SAVE;
- case 16:
- return AMDGPU::SI_SPILL_A128_SAVE;
- case 20:
- return AMDGPU::SI_SPILL_A160_SAVE;
- case 24:
- return AMDGPU::SI_SPILL_A192_SAVE;
- case 28:
- return AMDGPU::SI_SPILL_A224_SAVE;
- case 32:
- return AMDGPU::SI_SPILL_A256_SAVE;
- case 36:
- return AMDGPU::SI_SPILL_A288_SAVE;
- case 40:
- return AMDGPU::SI_SPILL_A320_SAVE;
- case 44:
- return AMDGPU::SI_SPILL_A352_SAVE;
- case 48:
- return AMDGPU::SI_SPILL_A384_SAVE;
- case 64:
- return AMDGPU::SI_SPILL_A512_SAVE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_SAVE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}
-static unsigned getVectorRegSpillSaveOpcode(Register Reg,
- const TargetRegisterClass *RC,
- unsigned Size,
- const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillSaveOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
- : getVGPRSpillSaveOpcode(Size);
+ return getVGPRSpillSaveOpcode(Size);
}
void SIInstrInfo::storeRegToStackSlot(
@@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}
- unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
- SpillSize, RI, *MFI);
+ unsigned Opcode =
+ getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_RESTORE;
- case 8:
- return AMDGPU::SI_SPILL_A64_RESTORE;
- case 12:
- return AMDGPU::SI_SPILL_A96_RESTORE;
- case 16:
- return AMDGPU::SI_SPILL_A128_RESTORE;
- case 20:
- return AMDGPU::SI_SPILL_A160_RESTORE;
- case 24:
- return AMDGPU::SI_SPILL_A192_RESTORE;
- case 28:
- return AMDGPU::SI_SPILL_A224_RESTORE;
- case 32:
- return AMDGPU::SI_SPILL_A256_RESTORE;
- case 36:
- return AMDGPU::SI_SPILL_A288_RESTORE;
- case 40:
- return AMDGPU::SI_SPILL_A320_RESTORE;
- case 44:
- return AMDGPU::SI_SPILL_A352_RESTORE;
- case 48:
- return AMDGPU::SI_SPILL_A384_RESTORE;
- case 64:
- return AMDGPU::SI_SPILL_A512_RESTORE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_RESTORE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
- if (IsVectorSuperClass)
+ if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
-static unsigned
-getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
- unsigned Size, const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillRestoreOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
- : getVGPRSpillRestoreOpcode(Size);
+ assert(!RI.isAGPRClass(RC));
+ return getVGPRSpillRestoreOpcode(Size);
}
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
- SpillSize, RI, *MFI);
+ SpillSize, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
@@ -2214,7 +2143,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (ST.hasMovB64()) {
MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
- isUInt<32>(SrcOp.getImm()))
+ isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
}
if (SrcOp.isImm()) {
@@ -2273,6 +2202,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
const MachineOperand &SrcOp = MI.getOperand(1);
assert(!SrcOp.isFPImm());
+
+ if (ST.has64BitLiterals()) {
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+
APInt Imm(64, SrcOp.getImm());
if (Imm.isIntN(32) || isInlineConstant(Imm)) {
MI.setDesc(get(AMDGPU::S_MOV_B64));
@@ -2492,6 +2427,25 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
+ MachineFunction &MF = *MBB.getParent();
+ Register Reg = MI.getOperand(0).getReg();
+ MachineOperand Op = MI.getOperand(1);
+
+ // Create a bundle so these instructions won't be re-ordered by the
+ // post-RA scheduler.
+ MIBundleBuilder Bundler(MBB, MI);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+ if (Op.isGlobal())
+ Op.setOffset(Op.getOffset() + 4);
+ Bundler.append(
+ BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
+
+ finalizeBundle(MBB, Bundler.begin());
+
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// Whole Wave Mode is entered.
@@ -2807,12 +2761,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
if (!DefinedRC1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0);
+ return isLegalRegOperand(MI, OpIdx1, *MO0) &&
+ (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
}
if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
if (!DefinedRC0)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, *MO1);
}
// No need to check 64-bit literals since swapping does not bring new
@@ -2903,9 +2859,9 @@ bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
int64_t BrOffset) const {
- // BranchRelaxation should never have to check s_setpc_b64 because its dest
- // block is unanalyzable.
- assert(BranchOp != AMDGPU::S_SETPC_B64);
+ // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
+ // because its dest block is unanalyzable.
+ assert(isSOPP(BranchOp) || isSOPK(BranchOp));
// Convert to dwords.
BrOffset /= 4;
@@ -2946,13 +2902,30 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ auto I = MBB.end();
+ auto &MCCtx = MF->getContext();
+
+ if (ST.hasAddPC64Inst()) {
+ MCSymbol *Offset =
+ MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
+ auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
+ .addSym(Offset, MO_FAR_BRANCH_OFFSET);
+ MCSymbol *PostAddPCLabel =
+ MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
+ AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
+ auto *OffsetExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
+ MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
+ Offset->setVariableValue(OffsetExpr);
+ return;
+ }
+
+ assert(RS && "RegScavenger required for long branching");
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- auto I = MBB.end();
-
// Note: as this is used after hazard recognizer we need to apply some hazard
// workarounds directly.
const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
@@ -2968,7 +2941,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
ApplyHazardWorkarounds();
- auto &MCCtx = MF->getContext();
MCSymbol *PostGetPCLabel =
MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
@@ -3507,6 +3479,10 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
? AMDGPU::V_FMAAK_F16_t16
: AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
+ case AMDGPU::V_FMAC_F64_e32:
+ case AMDGPU::V_FMAC_F64_e64:
+ case AMDGPU::V_FMA_F64_e64:
+ return AMDGPU::V_FMAAK_F64;
default:
llvm_unreachable("invalid instruction");
}
@@ -3535,6 +3511,10 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
? AMDGPU::V_FMAMK_F16_t16
: AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
+ case AMDGPU::V_FMAC_F64_e32:
+ case AMDGPU::V_FMAC_F64_e64:
+ case AMDGPU::V_FMA_F64_e64:
+ return AMDGPU::V_FMAMK_F64;
default:
llvm_unreachable("invalid instruction");
}
@@ -3613,7 +3593,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -3685,7 +3666,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3753,7 +3735,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -4074,8 +4057,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
- if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
- !IsLegacy &&
+ if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
+ (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
@@ -6099,14 +6082,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
if (Is64BitOp &&
!AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
- if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+ if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
+ (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
return false;
// FIXME: We can use sign extended 64-bit literals, but only for signed
// operands. At the moment we do not know if an operand is signed.
// Such operand will be encoded as its low 32 bits and then either
// correctly sign extended or incorrectly zero extended by HW.
- if (!Is64BitFPOp && (int32_t)Imm < 0)
+ // If 64-bit literals are supported and the literal will be encoded
+ // as full 64 bit we still can use it.
+ if (!Is64BitFPOp && (int32_t)Imm < 0 &&
+ (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
return false;
}
}
@@ -6402,7 +6389,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldSAddrIdx < 0)
return false;
- assert(isSegmentSpecificFLAT(Inst));
+ assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
if (NewOpc < 0)
@@ -6426,7 +6413,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldVAddrIdx >= 0) {
MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
- if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
+ if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
!VAddrDef->getOperand(1).isImm() ||
VAddrDef->getOperand(1).getImm() != 0)
return false;
@@ -6479,7 +6466,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
- if (!isSegmentSpecificFLAT(MI))
+ if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
return;
// Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
@@ -9178,15 +9165,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (isDPP(MI))
return DescSize;
bool HasLiteral = false;
+ unsigned LiteralSize = 4;
for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
const MCOperandInfo &OpInfo = Desc.operands()[I];
if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
HasLiteral = true;
+ if (ST.has64BitLiterals()) {
+ switch (OpInfo.OperandType) {
+ default:
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
+ LiteralSize = 8;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
+ LiteralSize = 8;
+ break;
+ }
+ }
break;
}
}
- return HasLiteral ? DescSize + 4 : DescSize;
+ return HasLiteral ? DescSize + LiteralSize : DescSize;
}
// Check whether we have extra NSA words.
@@ -9277,13 +9279,16 @@ SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
- { MO_GOTPCREL, "amdgpu-gotprel" },
- { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
- { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
- { MO_REL32_LO, "amdgpu-rel32-lo" },
- { MO_REL32_HI, "amdgpu-rel32-hi" },
- { MO_ABS32_LO, "amdgpu-abs32-lo" },
- { MO_ABS32_HI, "amdgpu-abs32-hi" },
+ {MO_GOTPCREL, "amdgpu-gotprel"},
+ {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
+ {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
+ {MO_GOTPCREL64, "amdgpu-gotprel64"},
+ {MO_REL32_LO, "amdgpu-rel32-lo"},
+ {MO_REL32_HI, "amdgpu-rel32-hi"},
+ {MO_REL64, "amdgpu-rel64"},
+ {MO_ABS32_LO, "amdgpu-abs32-lo"},
+ {MO_ABS32_HI, "amdgpu-abs32-hi"},
+ {MO_ABS64, "amdgpu-abs64"},
};
return ArrayRef(TargetFlags);
@@ -10390,10 +10395,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
return TargetInstrInfo::isGlobalMemoryObject(MI);
}
+bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
+ if (!isWMMA(MI) && !isSWMMAC(MI))
+ return false;
+
+ if (AMDGPU::isGFX1250(ST))
+ return AMDGPU::getWMMAIsXDL(MI.getOpcode());
+
+ return true;
+}
+
bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
- if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
+ if (AMDGPU::isGFX12Plus(ST))
+ return isDOT(MI) || isXDLWMMA(MI);
+
+ if (!isMAI(MI) || isDGEMM(Opcode) ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 9e84822bfc27..5e92921f3ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -33,6 +33,7 @@ class LiveVariables;
class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
+class SIMachineFunctionInfo;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
@@ -214,16 +215,20 @@ public:
MO_GOTPCREL32_LO = 2,
// MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI.
MO_GOTPCREL32_HI = 3,
+ // MO_GOTPCREL64 -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
+ MO_GOTPCREL64 = 4,
// MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO.
- MO_REL32 = 4,
- MO_REL32_LO = 4,
+ MO_REL32 = 5,
+ MO_REL32_LO = 5,
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
- MO_REL32_HI = 5,
+ MO_REL32_HI = 6,
+ MO_REL64 = 7,
- MO_FAR_BRANCH_OFFSET = 6,
+ MO_FAR_BRANCH_OFFSET = 8,
- MO_ABS32_LO = 8,
- MO_ABS32_HI = 9,
+ MO_ABS32_LO = 9,
+ MO_ABS32_HI = 10,
+ MO_ABS64 = 11,
};
explicit SIInstrInfo(const GCNSubtarget &ST);
@@ -283,6 +288,15 @@ public:
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;
+ unsigned getVectorRegSpillSaveOpcode(Register Reg,
+ const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+ unsigned
+ getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC,
@@ -863,6 +877,8 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
+ bool isXDLWMMA(const MachineInstr &MI) const;
+
bool isXDL(const MachineInstr &MI) const;
static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); }
@@ -1097,7 +1113,6 @@ public:
// that will not require an additional 4-bytes; this function assumes that it
// will.
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const {
- assert(!MO.isReg() && "isInlineConstant called on register operand!");
if (!MO.isImm())
return false;
return isInlineConstant(MO.getImm(), OperandType);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 5e41f875d980..9e1951e2946c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -268,6 +268,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
+def SIpc_add_rel_offset64 : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET64",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
def SIlds : SDNode<"AMDGPUISD::LDS",
SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
>;
@@ -1247,6 +1251,7 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
+def IndexKey32bit : CustomOperand<i32, 1>;
def IndexKey16bit : CustomOperand<i32, 1>;
def IndexKey8bit : CustomOperand<i32, 1>;
@@ -1302,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
def BitOp3 : NamedIntOperand<"bitop3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
+def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
+def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
+
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -1633,6 +1641,8 @@ def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
+def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern?
+def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
@@ -1641,6 +1651,7 @@ def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
+def SWMMACIndex32 : ComplexPattern<untyped, 2, "SelectSWMMACIndex32">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
@@ -2654,6 +2665,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
isModifierType<Src2VT>.ret,
HasOMod);
field bit HasNeg = HasModifiers;
+ field bit HasMatrixReuse = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
@@ -2837,6 +2849,8 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>;
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
+def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
+def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4419ce00b473..991d9f83e92e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1144,6 +1144,14 @@ def : GCNPat <
(SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;
+def SI_PC_ADD_REL_OFFSET64 : SPseudoInstSI <
+ (outs SReg_64:$dst),
+ (ins si_ga:$ptr),
+ [(set SReg_64:$dst,
+ (i64 (SIpc_add_rel_offset64 tglobaladdr:$ptr)))]> {
+ let SubtargetPredicate = Has64BitLiterals;
+}
+
def : GCNPat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
@@ -2465,7 +2473,6 @@ def : AMDGPUPat <
>;
let True16Predicate = NotHasTrue16BitInsts in {
-let SubtargetPredicate = isNotGFX9Plus in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2475,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
-} // isNotGFX9Plus
-
-let SubtargetPredicate = isGFX9GFX10 in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
-foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
- (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
-def : GCNPat<pat,
- (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
- 0, /* src1_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
- 0, /* src2_modifiers */
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src1,
- /* src2_modifiers */ 0,
- $src2, /* clamp */ 0, /* op_sel */ 0)
->;
-} // isGFX9GFX10
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
@@ -3104,8 +3082,6 @@ def : GCNPat <
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
-// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped
-// to V_PERM_B32.
let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(i32 (bswap i32:$a)),
@@ -3451,30 +3427,32 @@ def : GCNPat <
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
>;
-
def : GCNPat <
- (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
- (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
def : GCNPat <
- (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
- (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
+}
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
- (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
foreach vecTy = [v2i16, v2f16, v2bf16] in {
@@ -3581,20 +3559,15 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
-let True16Predicate = NotHasTrue16BitInsts in {
-defvar BuildVectorToAlignBitPat =
+let True16Predicate = NotHasTrue16BitInsts in
+def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
- (Ty VGPR_32:$b)));
-
-let SubtargetPredicate = isNotGFX9Plus in
-def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>;
-
-let SubtargetPredicate = isGFX9GFX10 in
-def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>;
-} //True16Predicate = NotHasTrue16BitInsts
+ (Ty VGPR_32:$b))),
+ (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b0d6fd95cd27..5097ac03954d 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = MI.getIterator();
++MBBI;
- const SITargetLowering *TLI =
- static_cast<const SITargetLowering *>(STM->getTargetLowering());
+ const SITargetLowering *TLI = STM->getTargetLowering();
for ( ; MBBI != E; ++MBBI) {
MachineInstr &MINext = *MBBI;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 67ad28661da4..75ce67c00228 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,7 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
- const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
+ const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9173041a7bcc..fa2b8db6ba55 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}
-unsigned
-SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const {
+unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass &RC,
+ bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
- if (MRI.isPhysRegUsed(Reg))
+ if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 06a7a17b0246..0008e5f8cf3b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,9 +486,11 @@ public:
unsigned SubReg) const;
// \returns a number of registers of a given \p RC used in a function.
- // Does not go inside function calls.
+ // Does not go inside function calls. If \p IncludeCalls is true, it will
+ // include registers that may be clobbered by calls.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const;
+ const TargetRegisterClass &RC,
+ bool IncludeCalls = true) const;
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index d24c301fc1e5..c194e5c255d4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1294,6 +1294,7 @@ def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">;
def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">;
def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">;
def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">;
def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">;
def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 1679cee32006..ef8faffa5f55 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;
def Write16PassDGEMM : SchedWrite;
+// WMMA/SWMMA instructions
+def WriteXDL2PassWMMA : SchedWrite;
+def WriteXDL4PassWMMA : SchedWrite;
+def Write4PassWMMA : SchedWrite;
+def Write8PassWMMA : SchedWrite;
+def Write16PassWMMA : SchedWrite;
+
// Scalar float instructions
def WriteSFPU : SchedWrite;
@@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
multiclass GFX125xCommonWriteRes {
+let ReleaseAtCycles = [8] in
+def : HWWriteRes<WriteXDL2PassWMMA, [HWXDL], 8>;
+let ReleaseAtCycles = [16] in
+def : HWWriteRes<WriteXDL4PassWMMA, [HWXDL], 16>;
+
+def : HWWriteRes<Write4PassWMMA, [HWVALU], 16>;
+def : HWWriteRes<Write8PassWMMA, [HWVALU], 32>;
+def : HWWriteRes<Write16PassWMMA, [HWVALU], 64>;
+
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 7>;
@@ -476,6 +492,11 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : InstRW<[WriteCopy], (instrs COPY)>;
+
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
+def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // End GFX125xCommonWriteRes
let SchedModel = GFX1250SpeedModel in {
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index fd39b8a1350c..7a519117f248 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -463,6 +463,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
+ case AMDGPU::V_FMA_F64_e64:
+ if (ST->hasFmaakFmamkF64Insts())
+ NewOpcode = AMDGPU::V_FMAAK_F64;
+ break;
}
}
@@ -497,6 +501,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
+ case AMDGPU::V_FMA_F64_e64:
+ if (ST->hasFmaakFmamkF64Insts())
+ NewOpcode = AMDGPU::V_FMAMK_F64;
+ break;
}
}
@@ -961,7 +969,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
- MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 ||
+ (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 &&
+ ST->hasFmaakFmamkF64Insts())) {
shrinkMadFma(MI);
continue;
}
@@ -1058,7 +1068,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
// fold an immediate into the shrunk instruction as a literal operand. In
// GFX10 VOP3 instructions can take a literal operand anyway, so there is
// no advantage to doing this.
- if (ST->hasVOP3Literal() && !IsPostRA)
+ // However, if 64-bit literals are allowed we still need to shrink it
+ // for such literal to be able to fold.
+ if (ST->hasVOP3Literal() &&
+ (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) &&
+ !IsPostRA)
continue;
if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2472b76fcf02..e103ccc2f00e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -154,6 +154,10 @@ class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
let has_sdst = 0;
}
+class SOP1_1_REGIMM64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SSrc_b64:$src0), "$src0", pattern> {
+ let has_sdst = 0;
+}
class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0),
@@ -317,6 +321,9 @@ let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+
+let SubtargetPredicate = HasAddPC64Inst in
+def S_ADD_PC_I64 : SOP1_1_REGIMM64 <"s_add_pc_i64">;
} // End isBranch = 1, isIndirectBranch = 1
let isReturn = 1 in {
@@ -2130,6 +2137,9 @@ defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
+// GFX1250
+defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
+
//===----------------------------------------------------------------------===//
// SOP1 - GFX1150, GFX12
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a32078cc403e..77258810dd68 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
#define GET_MAIInstInfoTable_IMPL
+#define GET_WMMAInstInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info && Info->is_gfx940_xdl;
}
+bool getWMMAIsXDL(unsigned Opc) {
+ const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc);
+ return Info ? Info->is_wmma_xdl : false;
+}
+
uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) {
switch (EncodingVal) {
case MFMAScaleFormats::FP6_E2M3:
@@ -639,6 +645,7 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
+ Opc == AMDGPU::V_FMAC_F64_e64_gfx12 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx12 ||
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6708e0a3f454..c9d2c286bf23 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -119,6 +119,11 @@ struct True16D16Info {
unsigned LoOp;
};
+struct WMMAInstInfo {
+ uint16_t Opcode;
+ bool is_wmma_xdl;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -129,6 +134,7 @@ struct True16D16Info {
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#define GET_True16D16Table_DECL
+#define GET_WMMAInstInfoTable_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc);
LLVM_READONLY
bool getMAIIsGFX940XDL(unsigned Opc);
+LLVM_READONLY
+bool getWMMAIsXDL(unsigned Opc);
+
// Get an equivalent BitOp3 for a binary logical \p Opc.
// \returns BitOp3 modifier for the logical operation or zero.
// Used in VOPD3 conversion.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 211112e5262a..f621f8581f77 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in
+defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -526,6 +529,21 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in {
+defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
+}
+
+let SubtargetPredicate = HasBF16TransInsts in {
+defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
+defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
+}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16",
@@ -785,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in {
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>;
}
+
+ defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>;
+ defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>;
} // End SubtargetPredicate = isGFX1250Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -1062,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1127,11 +1155,25 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
+defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
+defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
+defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
+defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
+defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
+defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
+defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
+defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
+defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
+defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 25c6cbc3e1ab..030a6e1e978c 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -175,10 +175,14 @@ multiclass VOP2Inst_e64<string opName,
def _e64 : VOP3InstBase <opName, P, node, 1>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
- let SubtargetPredicate = isGFX11Plus in {
- if P.HasExtVOP3DPP then
- def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
- } // End SubtargetPredicate = isGFX11Plus
+ if P.HasExtVOP3DPP then
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P> {
+ let SubtargetPredicate = isGFX11Plus;
+ }
+ else if P.HasExt64BitDPP then
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P> {
+ let OtherPredicates = [HasDPALU_DPP];
+ }
}
multiclass VOP2Inst_e64_VOPD<string opName,
@@ -1492,7 +1496,9 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
+ let OtherPredicates = !listconcat(ps.OtherPredicates,
+ !if(p.HasExt64BitDPP, [HasDPALU_DPP], []),
+ !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []));
}
class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget,
@@ -1832,6 +1838,9 @@ let SubtargetPredicate = isGFX12Plus in {
V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">;
} // End SubtargetPredicate = isGFX12Plus
+let SubtargetPredicate = HasFmacF64Inst in
+defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
+
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 75c531913ded..2e7f25b67fb6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
fshr, null_frag>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
-
-// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32.
-// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored.
-defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
-defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
-
let True16Predicate = UseRealTrue16Insts in
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
let True16Predicate = UseFakeTrue16Insts in
@@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
-defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
-
defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
@@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
-// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi.
-// The following is created to support that.
-multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> {
- defvar psName = opName#"_e64";
- def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
- VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
- let AsmString = AsmName # ps.AsmOperands;
- }
-}
-
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
@@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
-let SubtargetPredicate = isGFX8Only in {
defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
-}
defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
@@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
-defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
-defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
-
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 31997f803dfc..e51e9574f8de 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1223,6 +1223,8 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
Instruction Opcode2Addr = TwoAddr;
Instruction Opcode3Addr = ThreeAddr;
Predicate WaveSizePredicate;
+ Predicate SubtargetPredicate;
+ field bit is_wmma_xdl;
}
def WMMAOpcode : GenericEnum {
@@ -1315,28 +1317,39 @@ let WaveSizePredicate = isWave64 in {
}
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
- bit _IsIU, bit _IsFP8BF8>
+ bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
+ bit _HasMatrixReuse = 0, bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
- bit IsFP8BF8 = _IsFP8BF8;
- bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
+ bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
+ bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
int IndexType = _IndexType;
+ let HasMatrixReuse = _HasMatrixReuse;
+ bit HasIModOp = _Has_ImodOp;
+ let HasClamp = !and(IsIU, !not(HasIModOp));
let IsPacked = 1;
let IsWMMA = !not(_IsSWMMAC);
let IsSWMMAC = _IsSWMMAC;
- bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
- bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
+ bit IsAB_F64 = !or(!eq(ArgTy[1], v2f64), !eq(ArgTy[1], v4f64));
+ bit IsAB_F32 = !eq(ArgTy[1], v2f32);
+ bit IsAB_F16 = !or(!eq(ArgTy[1], v16f16), !eq(ArgTy[1], v8f16), !eq(ArgTy[1], v4f16));
+ bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16),
+ !eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16));
+ bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16);
+
+ bit IsC_F64 = !eq(ArgTy[3], v8f64);
bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
- bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
+ bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16),
+ !eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16));
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
- bit NegLo01 = !or(IsF16BF16, IsIU);
- bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
- bit NegHi01 = IsF16BF16;
- bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegLo01 = !not(NoABMods);
+ bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
+ bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1]
+ bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
bit NegLoAny = !or(NegLo01, NegLo2);
bit NegHiAny = !or(NegHi01, NegHi2);
@@ -1345,19 +1358,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
let Src1RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[2].Size);
let Src2RC64 = !if(IsSWMMAC, DstRC,
!cast<RegisterOperand>("VISrc_"#ArgTy[3].Size#
- !cond(IsC_F32: "_f32",
- IsC_F16: "_f16",
+ !cond(IsC_F64: "_f64",
+ IsC_F32: "_f32",
+ IsC_F16: "_f16",
IsC_BF16: "_bf16",
1: "_b32")));
// For f16 and bf16 matrices A and B, each element can be modified by
- // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is
+ // fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but
+ // neg_hi[0:1] is ignored. For iu4 and iu8 matrices A and B neg_lo is
// overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext)
- // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each
- // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
+ // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16, f32 and f64 matrix C
+ // each element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
// ---------------------------------------------------------------------------
+ // wmma f64_f64 | neg_lo for neg A/B | neg_lo = 1 neg C(f64)
+ // | neg_hi ignored | neg_hi = 1 abs C(f64)
+ // ---------------------------------------------------------------------------
+ // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32)
+ // | neg_hi ignored | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f32_xf32 | not allowed for xf32 | not allowed
+ // ---------------------------------------------------------------------------
// wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
// wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
@@ -1368,7 +1391,10 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// | neg_lo = 1 i4/i8(sext) | i32 matrices
// ---------------------------------------------------------------------------
// wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
- // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // | fp8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f16_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f16)
+ // | fp8 and bf8 matrices | neg_hi = 1 abs C(f16)
// ---------------------------------------------------------------------------
// swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
// swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
@@ -1380,103 +1406,153 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
// ---------------------------------------------------------------------------
// swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
- // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
+ // swmmac f16_fp8/bf8 | f8 and bf8 matrices | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
// pseudo
- // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
+ // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
// use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
// remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
// f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
-
- dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
- dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
- dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
+ dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers));
+ dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers));
+ dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers));
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
- !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
- dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins));
+ !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
+ !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
+
+ dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
+ dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
!and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
!and(!not(NegLoAny), !not(NegHiAny)) : (ins));
let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
!cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
- IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
- Clamp, Neg);
+ IsSWMMAC : !con((ins DstRC:$srcTiedDef),
+ !if(!eq(IndexType, 32),
+ (ins VRegSrc_64:$src2),
+ (ins VRegSrc_32:$src2)),
+ IndexKey)),
+ MatrixReuse, Clamp, Neg);
// asm
string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
!eq(IndexType, 8) : "$index_key_8bit",
- !eq(IndexType, 16) : "$index_key_16bit");
- string ClampAsm = !if(IsIU, "$clamp", "");
+ !eq(IndexType, 16) : "$index_key_16bit",
+ !eq(IndexType, 32) : "$index_key_32bit");
+ string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
+ string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
- let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
+ let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
+ bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
+ bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp));
+ bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp);
+ bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp);
+ dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0),
+ IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
+ IsAB_BF16_IMod0 : (ins Src0VT:$src0),
+ IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ NoABMods : (ins Src0VT:$src0));
+ dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0),
+ IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
+ NoABMods : (ins Src0VT:$src0));
+ dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1),
+ IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
+ IsAB_BF16_IMod0 : (ins Src1VT:$src1),
+ IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ NoABMods : (ins Src1VT:$src1));
+ dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1),
+ IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
+ NoABMods : (ins Src1VT:$src1));
+ bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32));
+ bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp));
+ bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp));
+ bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp));
+ bit IsIUXF32 = !or(IsIU, IsXF32);
+ dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2),
+ IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_BF16_IMod0 : (ins Src2VT:$src2),
+ IsIUXF32 : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2),
+ IsIUXF32 : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
- dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
- IsAB_BF16 : (ins Src0VT:$src0),
- IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
- IsFP8BF8 : (ins Src0VT:$src0));
- dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
- IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
- IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
- IsFP8BF8 : (ins Src0VT:$src0));
- dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
- IsAB_BF16 : (ins Src1VT:$src1),
- IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
- IsFP8BF8 : (ins Src1VT:$src1));
- dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
- IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
- IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
- IsFP8BF8 : (ins Src1VT:$src1));
- dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
- IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
- IsC_BF16 : (ins Src2VT:$src2),
- IsIU : (ins Src2VT:$src2),
- IsSWMMAC : (ins));
- dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
- IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
- IsC_BF16 : (ins (i32 8), Src2VT:$src2),
- IsIU : (ins Src2VT:$src2),
- IsSWMMAC : (ins));
- dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
- !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
+ !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
+ !eq(IndexType, 32): (ins (i64 (SWMMACIndex32 i64:$src2, i32:$index_key_32bit))));
dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
- !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
- dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
- dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
+ !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
+ !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
+ dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
+ dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
+ dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins));
+ dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
- dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
+ dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
- dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
- dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
+ dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
+ dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
+ dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
}
-multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+def WMMAInstInfoTable : GenericTable {
+ let FilterClass = "WMMAInstInfo";
+ let CppTypeName = "WMMAInstInfo";
+ let Fields = ["Opcode", "is_wmma_xdl"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getWMMAInstInfoHelper";
+}
+
+class WMMAInstInfo {
+ Instruction Opcode = !cast<Instruction>(NAME);
+ bit is_wmma_xdl = 0;
+}
+
+multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix, bit DiffVdstSrc2 = 0> {
+
+ defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
+ defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
+
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
- def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
}
- let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
- def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in
+ def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
}
@@ -1486,7 +1562,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
}
multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
- def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let Mnemonic = Instr;
let PseudoInstr = Instr#PseudoInstrSuffix;
let mayRaiseFPException = 0;
@@ -1556,6 +1632,76 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
// for matrix A, index is i16; Matrix B uses all lanes
+def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
+
+let WaveSizePredicate = isWave32 in {
+let SubtargetPredicate = isGFX125xOnly in {
+defm V_WMMA_F32_16X16X4_F32_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x4_f32", F32_F32_WMMA_w32, "_w32">;
+
+let is_wmma_xdl = 1 in {
+defm V_WMMA_F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_bf16", F32_BF16X32_WMMA_w32, "_w32">;
+defm V_WMMA_BF16_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x32_bf16", BF16_BF16X32_WMMA_w32, "_w32">;
+defm V_WMMA_BF16F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16f32_16x16x32_bf16", BF16F32_BF16_WMMA_w32, "_w32", 1>;
+defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X64_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x64_iu8", I32_IU8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_f16", F32_F16X32_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x32_f16", F16_F16X32_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_f32_32x16x128_f4", F32_32X16X128_F4_WMMA_w32, "_w32">;
+
+defm V_SWMMAC_F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x64_bf16", BF16_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x128_iu8", I32_IU8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
+
+} // End is_wmma_xdl = 1.
+
+} // End SubtargetPredicate = isGFX125xOnly
+} // End WaveSizePredicate = isWave32
+
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
@@ -1628,7 +1774,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile
let WaveSizePredicate = isWave64;
}
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
@@ -1655,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
}
-let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
@@ -1681,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
}
+let WaveSizePredicate = isWave32 in {
+let SubtargetPredicate = isGFX125xOnly in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, F32_F32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x128_iu8, I32_IU8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_f16, F32_F16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x64_f16, F16_F16X64_SWMMAC_w32>;
+} // End SubtargetPredicate = isGFX125xOnly
+} // End WaveSizePredicate = isWave32
//===----------------------------------------------------------------------===//
// Begin Real Encodings
@@ -1726,13 +1915,14 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
// opsel
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
- !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
+ !eq(WMMAP.IndexType, 16) : index_key_16bit{0},
+ !eq(WMMAP.IndexType, 32) : index_key_32bit{0});
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
- let Inst{13} = 0;
+ let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
// opsel_hi
let Inst{59} = 1;
let Inst{60} = 1;
- let Inst{14} = 1;
+ let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
@@ -1742,7 +1932,7 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
// clamp
- let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
+ let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0);
}
multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
@@ -1765,6 +1955,12 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
+multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>;
+ }
+}
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -1814,6 +2010,46 @@ defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>;
+defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>;
+defm V_WMMA_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x060, F32_F16X32_WMMA_w32>;
+defm V_WMMA_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x061, F16_F16X32_WMMA_w32>;
+defm V_WMMA_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x063, BF16_BF16X32_WMMA_w32>;
+defm V_WMMA_BF16F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x064, BF16F32_BF16_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06a, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06b, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06c, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06d, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06e, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06f, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x070, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x071, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_I32_16X16X64_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x072, I32_IU8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x080, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x081, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x082, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x083, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x084, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x085, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
+
+defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x068, BF16_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x069, F32_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x073, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x074, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x075, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x076, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x077, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x078, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x079, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07a, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X128_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07b, I32_IU8X128_SWMMAC_w32>;
+
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<8> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index df215d23f7f4..a25ebdf3e5f6 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
@@ -432,7 +441,7 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
-class VOP3Pe <VOPProfile P> : Enc64 {
+class VOP3Pe_Base {
bits<8> vdst;
bits<4> src0_modifiers;
bits<9> src0;
@@ -443,7 +452,12 @@ class VOP3Pe <VOPProfile P> : Enc64 {
bits<1> clamp;
bits<2> index_key_8bit;
bits<1> index_key_16bit;
+ bits<1> index_key_32bit;
+ bits<1> matrix_a_reuse;
+ bits<1> matrix_b_reuse;
+}
+class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base {
let Inst{7-0} = !if(P.HasDst, vdst, 0);
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
@@ -451,9 +465,13 @@ class VOP3Pe <VOPProfile P> : Enc64 {
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
- let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2},
+ !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
+ let Inst{14} = !cond(!and(P.HasSrc2, P.HasOpSel) : src2_modifiers{3},
+ P.IsDOT : 1,
+ P.HasMatrixReuse : matrix_b_reuse,
+ 1: ?); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);