summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td91
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp267
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp71
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp162
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp294
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp151
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp274
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp221
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td287
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td61
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp59
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h7
-rw-r--r--llvm/lib/Target/AMDGPU/EvergreenInstructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td227
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp129
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h29
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h58
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp54
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h15
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600Packetizer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp668
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td56
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td41
-rw-r--r--llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp68
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td17
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp58
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp66
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td17
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h11
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h14
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h39
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp113
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h24
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h79
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td37
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td20
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td51
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td27
98 files changed, 2894 insertions, 1893 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d0d7a9dc1724..63d83346528a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -351,6 +351,7 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
"GFX90AInsts",
"true",
"Additional instructions for GFX90A+"
+ // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO
>;
def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
@@ -711,6 +712,30 @@ def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32",
+ "HasAtomicFMinFMaxF32GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64",
+ "HasAtomicFMinFMaxF64GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for double"
+>;
+
+def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
+ "HasAtomicFMinFMaxF32FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
+ "HasAtomicFMinFMaxF64FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for double"
+>;
+
def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
"HasAtomicFaddNoRtnInsts",
"true",
@@ -743,6 +768,12 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst",
+ "HasAtomicBufferPkAddBF16Inst",
+ "true",
+ "Has buffer_atomic_pk_add_bf16 instruction"
+>;
+
def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts",
"HasAtomicCSubNoRtnInsts",
"true",
@@ -1061,7 +1092,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
- FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
]
>;
@@ -1072,7 +1104,9 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1127,7 +1161,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength63
+ FeatureMaxHardClauseLength63,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1148,7 +1184,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1169,7 +1206,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1332,7 +1370,10 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeaturePackedTID,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload])>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ])>;
def FeatureISAVersion9_0_C : FeatureSet<
!listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
@@ -1372,7 +1413,10 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureArchitectedFlatScratch,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload]>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ]>;
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
@@ -1561,6 +1605,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureAtomicFlatPkAdd16Insts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeatureAtomicGlobalPkAddBF16Inst,
+ FeatureAtomicBufferPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
@@ -1572,7 +1617,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
- FeatureDPPSrc1SGPR]>;
+ FeatureDPPSrc1SGPR,
+ FeatureMaxHardClauseLength32,
+ Feature1_5xVGPRs]>;
def FeatureISAVersion12_Generic: FeatureSet<
!listconcat(FeatureISAVersion12.Features,
@@ -1862,9 +1909,28 @@ def isGFX12Plus :
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
-def HasBufferFlatGlobalAtomicsF64 :
+
+def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ // FIXME: This is too coarse; it is a workaround for using the pseudo's predicates on the real instruction.
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>;
+
+def HasAtomicFMinFMaxF32GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>;
+
+def HasAtomicFMinFMaxF64GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>;
+
+def HasAtomicFMinFMaxF32FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>;
+
+def HasAtomicFMinFMaxF64FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>;
+
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
@@ -2118,7 +2184,10 @@ def HasAtomicBufferGlobalPkAddF16Insts
AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>;
def HasAtomicGlobalPkAddBF16Inst
: Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">,
- AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+ AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+def HasAtomicBufferPkAddBF16Inst
+ : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">,
+ AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>;
def HasFlatAtomicFaddF32Inst
: Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index de25f9241a50..f57fc168c1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -115,6 +115,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
return std::tuple(
PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+ case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_SIZE:
+ return {PrivateSegmentSize ? &PrivateSegmentSize : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32)};
case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR:
return std::tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
&AMDGPU::SGPR_64RegClass,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8..2e02bb4271ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -114,11 +114,12 @@ struct AMDGPUFunctionArgInfo {
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
IMPLICIT_BUFFER_PTR = 15,
IMPLICIT_ARG_PTR = 16,
+ PRIVATE_SEGMENT_SIZE = 17,
// VGPRS:
- WORKITEM_ID_X = 17,
- WORKITEM_ID_Y = 18,
- WORKITEM_ID_Z = 19,
+ WORKITEM_ID_X = 18,
+ WORKITEM_ID_Y = 19,
+ WORKITEM_ID_Z = 20,
FIRST_VGPR_VALUE = WORKITEM_ID_X
};
// clang-format on
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cad4a3430327..e49925f86bd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -29,6 +29,7 @@
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -135,15 +136,6 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
-uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
- int64_t Val;
- if (!Value->evaluateAsAbsolute(Val)) {
- Ctx.reportError(SMLoc(), "could not resolve expression when required.");
- return 0;
- }
- return static_cast<uint64_t>(Val);
-}
-
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Init target streamer if it has not yet happened
if (!IsTargetStreamerInitialized)
@@ -248,14 +240,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
- IsaInfo::getNumExtraSGPRs(
- &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
- getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
- getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
- getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ MCBinaryExpr::createSub(
+ CurrentProgramInfo.NumSGPRsForWavesPerEU,
+ AMDGPUMCExpr::createExtraSGPRs(
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
+ Context),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.popSection();
}
@@ -400,9 +392,40 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
false);
}
-uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
+ SmallString<128> Str;
+ raw_svector_ostream OSS(Str);
+ int64_t IVal;
+ if (Value->evaluateAsAbsolute(IVal)) {
+ OSS << static_cast<uint64_t>(IVal);
+ } else {
+ Value->print(OSS, MAI);
+ }
+ return Str;
+}
+
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+ const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+ OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
+ OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
+ if (NumAGPR && TotalNumVGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
+ false);
+ }
+ OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
+ false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
+const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
@@ -430,16 +453,28 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
+ if (UserSGPRInfo.hasPrivateSegmentSize()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+ }
if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
- CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
- KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
-
- return KernelCodeProperties;
+ // CurrentProgramInfo.DynamicCallStack is an MCExpr and may not be
+ // evaluatable at this point, so it cannot be conditionally checked here.
+ // Instead, we directly shift the possibly unknown MCExpr into its place
+ // and bitwise-or it into KernelCodeProperties.
+ const MCExpr *KernelCodePropExpr =
+ MCConstantExpr::create(KernelCodeProperties, Ctx);
+ const MCExpr *OrValue = MCConstantExpr::create(
+ amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
+ OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
+ OrValue, Ctx);
+ KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
+
+ return KernelCodePropExpr;
}
MCKernelDescriptor
@@ -462,11 +497,15 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
- KernelDescriptor.kernel_code_properties =
- MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
-
- assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+ int64_t PGRM_Rsrc3 = 1;
+ bool EvaluatableRsrc3 =
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
+ (void)PGRM_Rsrc3;
+ (void)EvaluatableRsrc3;
+ assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
@@ -554,13 +593,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
- STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
- : std::optional<uint32_t>(),
- getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
+ CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -571,43 +607,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
- false);
+ " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
+
OutStreamer->emitRawComment(
- " VGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
- false);
+ " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
OutStreamer->emitRawComment(
" NumSGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
false);
OutStreamer->emitRawComment(
" NumVGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
false);
- if (STM.hasGFX90AInsts())
+ if (STM.hasGFX90AInsts()) {
+ const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
+ CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
+ AdjustedAccum = MCBinaryExpr::createMul(
+ AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
OutStreamer->emitRawComment(
- " AccumOffset: " +
- Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
- 4),
- false);
+ " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
+ }
OutStreamer->emitRawComment(
- " Occupancy: " +
- Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
- false);
+ " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
+ getMCExprStr(CurrentProgramInfo.ScratchEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
@@ -628,20 +659,25 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.TIdIGCompCount),
false);
+ [[maybe_unused]] int64_t PGMRSrc3;
assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
+ PGMRSrc3) &&
+ static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
}
}
@@ -765,7 +801,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
- const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+ const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
ProgInfo.VCCUsed, ProgInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
@@ -858,27 +894,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
}
- ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+ ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
- ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+ ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
- ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
unsigned MaxWaves = MFI->getMaxWavesPerEU();
- ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
- Ctx);
- ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
- {ProgInfo.NumVGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
- Ctx);
+ ProgInfo.NumSGPRsForWavesPerEU =
+ AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
+ Ctx);
+ ProgInfo.NumVGPRsForWavesPerEU =
+ AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+ Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
@@ -927,10 +963,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned Granule) {
const MCExpr *OneConst = CreateExpr(1ul);
const MCExpr *GranuleConst = CreateExpr(Granule);
- const MCExpr *MaxNumGPR =
- AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
const MCExpr *AlignToGPR =
- AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
const MCExpr *DivGPR =
MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
@@ -972,7 +1007,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The MCExpr equivalent of divideCeil.
auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
const MCExpr *Ceil =
- AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+ AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
};
@@ -1045,7 +1080,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
- ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+ ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
@@ -1207,41 +1242,49 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
auto &Ctx = MF.getContext();
MD->setEntryPoint(CC, MF.getFunction().getName());
- MD->setNumUsedVgprs(
- CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
+ MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
// Only set AGPRs for supported devices
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.hasMAIInsts()) {
- MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
+ MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
}
- MD->setNumUsedSgprs(
- CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
+ MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
if (MD->getPALMajorVersion() < 3) {
- MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
+ MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
if (AMDGPU::isCompute(CC)) {
- MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
+ MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
- if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
- MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+ const MCExpr *HasScratchBlocks =
+ MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
+ MCConstantExpr::create(0, Ctx), Ctx);
+ auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
+ MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
- MD->setHwStage(CC, ".scratch_en",
- (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
+ MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
+ CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
}
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(
- CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
+ CC,
+ AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
+ MCConstantExpr::create(16, Ctx), Ctx),
+ Ctx);
+
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
: CurrentProgramInfo.LDSBlocks;
if (MD->getPALMajorVersion() < 3) {
- MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
+ MD->setRsrc2(
+ CC,
+ MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
+ Ctx);
MD->setSpiPsInputEna(MFI->getPSInputEnable());
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
} else {
@@ -1288,20 +1331,19 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
- MD->setRsrc1(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
+ MD->setRsrc1(
+ CallingConv::AMDGPU_CS,
+ CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
MD->setRsrc2(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getComputePGMRSrc2());
+ CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
}
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
- MD->setFunctionNumUsedVgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
- MD->setFunctionNumUsedSgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
+ MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
@@ -1362,6 +1404,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
if (UserSGPRInfo.hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+ if (UserSGPRInfo.hasPrivateSegmentSize())
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
@@ -1463,28 +1508,26 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
- MCContext &MCCtx = MF.getContext();
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "SGPRs",
- getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
- EmitResourceUsageRemark(
- "NumVGPR", "VGPRs",
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+ getMCExprStr(CurrentProgramInfo.NumSGPR));
+ EmitResourceUsageRemark("NumVGPR", "VGPRs",
+ getMCExprStr(CurrentProgramInfo.NumArchVGPR));
if (hasMAIInsts) {
- EmitResourceUsageRemark(
- "NumAGPR", "AGPRs",
- getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+ EmitResourceUsageRemark("NumAGPR", "AGPRs",
+ getMCExprStr(CurrentProgramInfo.NumAccVGPR));
}
- EmitResourceUsageRemark(
- "ScratchSize", "ScratchSize [bytes/lane]",
- getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
+ EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
+ getMCExprStr(CurrentProgramInfo.ScratchSize));
+ int64_t DynStack;
+ bool DynStackEvaluatable =
+ CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
StringRef DynamicStackStr =
- getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
- : "False";
+ DynStackEvaluatable && DynStack ? "True" : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
+ getMCExprStr(CurrentProgramInfo.Occupancy));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 87156f27fc6c..f70a60aef007 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -65,12 +65,16 @@ private:
uint32_t TotalNumVGPR, uint32_t NumSGPR,
uint64_t ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI);
+ void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
+ const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR,
+ const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI);
void emitResourceUsageRemarks(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo,
bool isModuleEntryFunction, bool hasMAIInsts);
- uint16_t getAmdhsaKernelCodeProperties(
- const MachineFunction &MF) const;
+ const MCExpr *getAmdhsaKernelCodeProperties(const MachineFunction &MF) const;
AMDGPU::MCKernelDescriptor
getAmdhsaKernelDescriptor(const MachineFunction &MF,
@@ -78,7 +82,7 @@ private:
void initTargetStreamer(Module &M);
- static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
+ SmallString<128> getMCExprStr(const MCExpr *Value);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1d645002b1fe..d7ef6f3c5dc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
default:
return;
- case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
Op = AtomicRMWInst::Add;
break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
- case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
Op = AtomicRMWInst::And;
break;
- case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
Op = AtomicRMWInst::Or;
break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
Op = AtomicRMWInst::Xor;
break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
@@ -413,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -425,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
- B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
+ B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}
@@ -433,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -481,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
Value *UpdateDPPCall =
@@ -523,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 231db188e65d..537d3a43aa9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
-
+def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
@@ -290,7 +287,6 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
-def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index a0c6bf7cc31c..fb258547e8fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -46,8 +46,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -192,8 +192,8 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
@@ -209,8 +209,10 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
MachineFunction &MF) {
- MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
- MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
+ MachineDominatorTree &DT =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachinePostDominatorTree &PDT =
+ getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
MachineUniformityInfo &MUI =
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 7ab9ba285133..efe47b2c3eed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -464,16 +464,6 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
- auto GetMCExprValue = [&MF](const MCExpr *Value) {
- int64_t Val;
- if (!Value->evaluateAsAbsolute(Val)) {
- MCContext &Ctx = MF.getContext();
- Ctx.reportError(SMLoc(), "could not resolve expression when required.");
- Val = 0;
- }
- return static_cast<uint64_t>(Val);
- };
-
auto Kern = HSAMetadataDoc->getMapNode();
Align MaxKernArgAlign;
@@ -481,11 +471,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
- Kern[".private_segment_fixed_size"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize));
+ DelayedExprs->assignDocNode(Kern[".private_segment_fixed_size"],
+ msgpack::Type::UInt, ProgramInfo.ScratchSize);
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
- Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
- static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack)));
+ DelayedExprs->assignDocNode(Kern[".uses_dynamic_stack"],
+ msgpack::Type::Boolean,
+ ProgramInfo.DynamicCallStack);
}
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
@@ -497,15 +488,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
- Kern[".sgpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR));
- Kern[".vgpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR));
+ DelayedExprs->assignDocNode(Kern[".sgpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumSGPR);
+ DelayedExprs->assignDocNode(Kern[".vgpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumVGPR);
// Only add AGPR count to metadata for supported devices
if (STM.hasMAIInsts()) {
- Kern[".agpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR));
+ DelayedExprs->assignDocNode(Kern[".agpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumAccVGPR);
}
Kern[".max_flat_workgroup_size"] =
@@ -527,6 +518,7 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
}
bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+ DelayedExprs->resolveDelayedExpressions();
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
@@ -536,9 +528,11 @@ void MetadataStreamerMsgPackV4::begin(const Module &Mod,
emitTargetID(TargetID);
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+ DelayedExprs->clear();
}
void MetadataStreamerMsgPackV4::end() {
+ DelayedExprs->resolveDelayedExpressions();
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
HSAMetadataDoc->toYAML(StrOS);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 0e3bc63919f0..fd76666dc360 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
+#include "Utils/AMDGPUDelayedMCExpr.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -65,6 +66,9 @@ protected:
class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
: public MetadataStreamer {
protected:
+ std::unique_ptr<DelayedMCExprs> DelayedExprs =
+ std::make_unique<DelayedMCExprs>();
+
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1..86f28a505769 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1482,9 +1482,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
- if (MFMAChainSeeds.size() &&
- std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
- MFMAChainSeeds.end())
+ if (is_contained(MFMAChainSeeds, MFMAPipeSU))
continue;
if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
[&TII](SDep &Succ) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b50c0cc12626..6d5ffc66d98b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// isa<MemSDNode> almost works but is slightly too permissive for some DS
// intrinsics.
- if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -2006,12 +2004,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return true;
}
+// For unbuffered smem loads, it is illegal for the Immediate Offset to be
+// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
+// Handle the case where the Immediate Offset + SOffset is negative.
+bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
+ bool Imm32Only,
+ bool IsBuffer,
+ int64_t ImmOffset) const {
+ if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
+ AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
+ KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
+ if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
+ return false;
+ }
+
+ return true;
+}
+
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer) const {
+ bool Imm32Only, bool IsBuffer,
+ bool HasSOffset,
+ int64_t ImmOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
@@ -2019,15 +2036,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
if (!C) {
if (!SOffset)
return false;
+
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode;
- return true;
+ return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+ ImmOffset);
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode.getOperand(0);
- return true;
+ return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+ ImmOffset);
}
}
return false;
@@ -2038,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
// offset for S_BUFFER instructions is unsigned.
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
- std::optional<int64_t> EncodedOffset =
- AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
+ std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
+ *Subtarget, ByteOffset, IsBuffer, HasSOffset);
if (EncodedOffset && Offset && !Imm32Only) {
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
@@ -2098,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only,
- bool IsBuffer) const {
+ bool Imm32Only, bool IsBuffer,
+ bool HasSOffset,
+ int64_t ImmOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
- SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+
+ if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ return false;
+
+ int64_t ImmOff = 0;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
+ ImmOff = C->getSExtValue();
+
+ return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
+ ImmOff);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2123,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
}
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
+
+ if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
+ if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset)) {
SBase = N1;
return true;
}
@@ -2551,14 +2583,6 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
-void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) {
- // TODO: Select this with a tablegen pattern. This is tricky because the
- // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
- // mayLoad/mayStore and tablegen complains about the mismatch.
- SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32);
- CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg);
-}
-
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -2715,9 +2739,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
- case Intrinsic::amdgcn_pops_exiting_wave_id:
- SelectPOPSExitingWaveID(N);
- return;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8e5662a3cd81..e7911bc1793d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -24,10 +24,6 @@ using namespace llvm;
namespace {
-static inline bool isNullConstantOrUndef(SDValue V) {
- return V.isUndef() || isNullConstant(V);
-}
-
static inline bool getConstantValue(SDValue N, uint32_t &Out) {
// This is only used for packed vectors, where using 0 for undef should
// always be good.
@@ -136,6 +132,8 @@ private:
bool isFlatScratchBaseLegal(SDValue Addr) const;
bool isFlatScratchBaseLegalSV(SDValue Addr) const;
bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
+ bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only,
+ bool IsBuffer, int64_t ImmOffset = 0) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -178,11 +176,13 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false) const;
+ bool IsBuffer = false, bool HasSOffset = false,
+ int64_t ImmOffset = 0) const;
SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false) const;
+ bool IsBuffer = false, bool HasSOffset = false,
+ int64_t ImmOffset = 0) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -194,6 +194,8 @@ private:
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
SDValue &Offset) const;
+ bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
@@ -267,7 +269,6 @@ private:
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
- void SelectPOPSExitingWaveID(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 375643b7f519..522b3a34161c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -42,8 +42,10 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
if (StoreSize <= 32)
return EVT::getIntegerVT(Ctx, StoreSize);
- assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
- return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+ if (StoreSize % 32 == 0)
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+
+ return VT;
}
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
@@ -5522,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
- NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -5562,7 +5562,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 71c4334029b4..37572af3897f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -575,8 +575,6 @@ enum NodeType : unsigned {
TBUFFER_LOAD_FORMAT_D16,
DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
- ATOMIC_LOAD_FMIN,
- ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
@@ -615,7 +613,6 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
- BUFFER_ATOMIC_FADD_BF16,
BUFFER_ATOMIC_FMIN,
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b78952ca3a62..43b3bf43fe56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUGenSearchableTables.inc"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
@@ -214,12 +215,14 @@ public:
RegisterUseCount[Unit]++;
// Do not attempt to optimise across exec mask changes.
- if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
+ AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
for (auto &UsedReg : RegisterUseCount)
UsedReg.second = 2;
}
- if (!SIInstrInfo::isVALU(MI))
+ if (!SIInstrInfo::isVALU(MI) ||
+ AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
continue;
if (AllProducerOperandsAreSingleUse) {
SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 160a17584ca3..93bca4402ed2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
- case Intrinsic::amdgcn_buffer_store_format:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
- case Intrinsic::amdgcn_tbuffer_store:
case Intrinsic::amdgcn_image_store_1d:
case Intrinsic::amdgcn_image_store_1darray:
case Intrinsic::amdgcn_image_store_2d:
@@ -1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
- case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ae3f2b87f353..a3cb3b3f47e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2079,21 +2079,6 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
-bool AMDGPUInstructionSelector::selectPOPSExitingWaveID(
- MachineInstr &MI) const {
- Register Dst = MI.getOperand(0).getReg();
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock *MBB = MI.getParent();
-
- // TODO: Select this with a tablegen pattern. This is tricky because the
- // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
- // mayLoad/mayStore and tablegen complains about the mismatch.
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
- .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID);
- MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
@@ -2144,8 +2129,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrierSignalIsfirst(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_leave:
return selectSBarrierLeave(I);
- case Intrinsic::amdgcn_pops_exiting_wave_id:
- return selectPOPSExitingWaveID(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -3620,8 +3603,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_FADD:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
@@ -4216,10 +4199,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
return false;
const GEPInfo &GEPI = AddrInfo[0];
- std::optional<int64_t> EncodedImm =
- AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
+ std::optional<int64_t> EncodedImm;
if (SOffset && Offset) {
+ EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+ /*HasSOffset=*/true);
if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
@@ -4229,6 +4213,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
+ if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
+ return true;
+
+ // For unbuffered smem loads, it is illegal for the Immediate Offset
+ // to be negative if the resulting (Offset + (M0 or SOffset or zero))
+ // is negative. Handle the case where the Immediate Offset + SOffset
+ // is negative.
+ auto SKnown = KB->getKnownBits(*SOffset);
+ if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
+ return false;
+
return true;
}
}
@@ -4236,6 +4231,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
return false;
}
+ EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+ /*HasSOffset=*/false);
if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
Base = GEPI.SgprParts[0];
*Offset = *EncodedImm;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 48f3b1811801..f561d5d29efc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -125,7 +125,6 @@ private:
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
- bool selectPOPSExitingWaveID(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index fa7492ac6cbe..c6dbc58395e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -140,7 +140,9 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0,
let PrintMethod = printer;
}
-def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
+class S16ImmOperand : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
+
+def s16imm : S16ImmOperand;
def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">;
class ValuePredicatedOperand<CustomOperand op, string valuePredicate,
@@ -616,6 +618,7 @@ multiclass local_addr_space_atomic_op {
}
}
+defm int_amdgcn_flat_atomic_fadd : noret_op;
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
defm int_amdgcn_flat_atomic_fmin : noret_op;
@@ -627,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
-defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
@@ -637,9 +639,14 @@ defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
-multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
- defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
+ defm "_noret" : binary_atomic_op<atomic_op>;
+}
+
+multiclass noret_binary_atomic_op_fp<SDNode atomic_op> {
+ let HasNoUse = true in
+ defm "_noret" : binary_atomic_op_fp<atomic_op>;
}
multiclass noret_ternary_atomic_op<SDNode atomic_op> {
@@ -647,11 +654,21 @@ multiclass noret_ternary_atomic_op<SDNode atomic_op> {
defm "_noret" : ternary_atomic_op<atomic_op>;
}
-multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
- foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+defvar atomic_addrspace_names = [ "global", "flat", "constant", "local", "private", "region" ];
+
+multiclass binary_atomic_op_all_as<SDNode atomic_op> {
+ foreach as = atomic_addrspace_names in {
+ let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+ defm "_"#as : binary_atomic_op<atomic_op>;
+ defm "_"#as : noret_binary_atomic_op<atomic_op>;
+ }
+ }
+}
+multiclass binary_atomic_op_fp_all_as<SDNode atomic_op> {
+ foreach as = atomic_addrspace_names in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
- defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
- defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>;
+ defm "_"#as : binary_atomic_op_fp<atomic_op>;
+ defm "_"#as : noret_binary_atomic_op_fp<atomic_op>;
}
}
}
@@ -666,11 +683,11 @@ defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>;
defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
-defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>;
+defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>;
+defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
+defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
-let MemoryVT = v2f16 in
-defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa..f1254b2e9e1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
+static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
+static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -301,6 +303,9 @@ static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);
+static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
+static const LLT V2BF16 = V2F16; // FIXME
+
static const LLT V2S32 = LLT::fixed_vector(2, 32);
static const LLT V3S32 = LLT::fixed_vector(3, 32);
static const LLT V4S32 = LLT::fixed_vector(4, 32);
@@ -1638,13 +1643,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasLdsAtomicAddF64())
Atomic.legalFor({{S64, LocalPtr}});
if (ST.hasAtomicDsPkAdd16Insts())
- Atomic.legalFor({{V2S16, LocalPtr}});
+ Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
+ getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
+ .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
+
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
@@ -1656,6 +1664,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
});
}
+ if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+ Atomic.legalFor({{V2F16, GlobalPtr}});
+ if (ST.hasAtomicGlobalPkAddBF16Inst())
+ Atomic.legalFor({{V2BF16, GlobalPtr}});
+ if (ST.hasAtomicFlatPkAdd16Insts())
+ Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
@@ -5388,12 +5403,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
- return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
- return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
- return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
@@ -5417,6 +5430,126 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}
+/// Legalize a cross-lane intrinsic (readlane, writelane, readfirstlane,
+/// permlane16, permlanex16, permlane64) based on the size of its result.
+///
+///  - A 32-bit result is left untouched (already legal).
+///  - A result narrower than 32 bits has its data operands any-extended to
+///    S32; the lane op is emitted on S32 and the result truncated back.
+///  - A result whose size is a multiple of 32 bits has its data operands
+///    unmerged into pieces; one lane op is emitted per piece and the partial
+///    results are merged back into the destination.
+///
+/// \param Helper supplies the MachineIRBuilder used to emit new instructions.
+/// \param MI     the intrinsic instruction being legalized.
+/// \param IID    the intrinsic ID of \p MI.
+/// \returns true on success (\p MI is erased unless it was already legal),
+/// false when the result is wider than 32 bits but not a multiple of 32.
+// TODO: Fix pointer type handling
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+                      IID == Intrinsic::amdgcn_permlanex16;
+
+  // Emits one copy of the lane intrinsic producing a value of type VT and
+  // returns its result register. Src1/Src2 are only appended for the
+  // intrinsics that take them; the remaining permlane16/permlanex16 operands
+  // are copied straight from MI.
+  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
+                                      Register Src2, LLT VT) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_permlane64:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16: {
+      Register Src3 = MI.getOperand(5).getReg();
+      // Operands 6 and 7 are immediates, not registers. Hold them as int64_t
+      // (the type getImm() returns and addImm() takes): routing them through
+      // Register, which wraps an unsigned, would turn a negative immediate
+      // into a large positive one.
+      int64_t Src4 = MI.getOperand(6).getImm();
+      int64_t Src5 = MI.getOperand(7).getImm();
+      return LaneOp.addUse(Src1)
+          .addUse(Src2)
+          .addUse(Src3)
+          .addImm(Src4)
+          .addImm(Src5)
+          .getReg(0);
+    }
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
+  // Operand 0 is the result; sources start at operand 2 (operand 1 is
+  // presumably the intrinsic ID -- confirm against the G_INTRINSIC layout).
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+      IsPermLane16) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    // Already legal
+    return true;
+  }
+
+  if (Size < 32) {
+    // Widen the data operands to 32 bits, run the lane op on S32 and narrow
+    // the result back to the original type.
+    Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+
+    if (IsPermLane16)
+      Src1 = B.buildAnyExt(S32, Src1).getReg(0);
+
+    if (IID == Intrinsic::amdgcn_writelane)
+      Src2 = B.buildAnyExt(S32, Src2).getReg(0);
+
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+    B.buildTrunc(DstReg, LaneOpDst);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Wider values are only handled when they split evenly into 32-bit pieces.
+  if (Size % 32 != 0)
+    return false;
+
+  // Pick the type of each 32-bit piece: a 2-element vector for 16-bit
+  // elements, the element type itself for 32-bit elements, S32 otherwise.
+  LLT PartialResTy = S32;
+  if (Ty.isVector()) {
+    LLT EltTy = Ty.getElementType();
+    switch (EltTy.getSizeInBits()) {
+    case 16:
+      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+      break;
+    case 32:
+      PartialResTy = EltTy;
+      break;
+    default:
+      // Handle all other cases via S32 pieces
+      break;
+    }
+  }
+
+  SmallVector<Register, 2> PartialRes;
+  unsigned NumParts = Size / 32;
+  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  MachineInstrBuilder Src1Parts, Src2Parts;
+
+  // Only the operands that carry data of the result type are split; the
+  // other operands are passed unchanged to every piece.
+  if (IsPermLane16)
+    Src1Parts = B.buildUnmerge(PartialResTy, Src1);
+
+  if (IID == Intrinsic::amdgcn_writelane)
+    Src2Parts = B.buildUnmerge(PartialResTy, Src2);
+
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Src0 = Src0Parts.getReg(i);
+
+    if (IsPermLane16)
+      Src1 = Src1Parts.getReg(i);
+
+    if (IID == Intrinsic::amdgcn_writelane)
+      Src2 = Src2Parts.getReg(i);
+
+    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
+  }
+
+  B.buildMergeLikeInstr(DstReg, PartialRes);
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -6008,9 +6141,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -6630,9 +6760,9 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
- // TODO: Should this use datalayout alignment?
const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign(std::min(MemSize, 4u));
+ const Align MemAlign = B.getDataLayout().getABITypeAlign(
+ getTypeForLLT(Ty, MF.getFunction().getContext()));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
@@ -7318,14 +7448,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
@@ -7365,6 +7490,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc2..ae01bb29c110 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -210,6 +210,9 @@ public:
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
+ bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+ Intrinsic::ID IID) const;
+
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index c515138d95a2..456f3cb332cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1129,15 +1129,11 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
if (needcopysign) {
- Value *opr_n;
- Type* rTy = opr0->getType();
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
- Type *nTy = nTyS;
- if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
- nTy = FixedVectorType::get(nTyS, vTy);
+ Type *nTy = FPOp->getType()->getWithNewType(nTyS);
unsigned size = nTy->getScalarSizeInBits();
- opr_n = FPOp->getOperand(1);
- if (opr_n->getType()->isIntegerTy())
+ Value *opr_n = FPOp->getOperand(1);
+ if (opr_n->getType()->getScalarType()->isIntegerTy())
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index f878bd9465d3..a8f6ad09fe28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -200,6 +200,7 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
@@ -214,6 +215,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
@@ -578,18 +580,14 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
/// buffer fat pointer constant.
static std::pair<Constant *, Constant *>
splitLoweredFatBufferConst(Constant *C) {
- if (auto *AZ = dyn_cast<ConstantAggregateZero>(C))
- return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1));
- if (auto *SC = dyn_cast<ConstantStruct>(C))
- return std::make_pair(SC->getOperand(0), SC->getOperand(1));
- llvm_unreachable("Conversion should've created a {p8, i32} struct");
+ assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer");
+ return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u));
}
namespace {
/// Handle the remapping of ptr addrspace(7) constants.
class FatPtrConstMaterializer final : public ValueMaterializer {
BufferFatPtrToStructTypeMap *TypeMap;
- BufferFatPtrToIntTypeMap *IntTypeMap;
// An internal mapper that is used to recurse into the arguments of constants.
// While the documentation for `ValueMapper` specifies not to use it
// recursively, examination of the logic in mapValue() shows that it can
@@ -599,16 +597,12 @@ class FatPtrConstMaterializer final : public ValueMaterializer {
Constant *materializeBufferFatPtrConst(Constant *C);
- const DataLayout &DL;
-
public:
// UnderlyingMap is the value map this materializer will be filling.
FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
- ValueToValueMapTy &UnderlyingMap,
- BufferFatPtrToIntTypeMap *IntTypeMap,
- const DataLayout &DL)
- : TypeMap(TypeMap), IntTypeMap(IntTypeMap),
- InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {}
+ ValueToValueMapTy &UnderlyingMap)
+ : TypeMap(TypeMap),
+ InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {}
virtual ~FatPtrConstMaterializer() = default;
Value *materialize(Value *V) override;
@@ -631,10 +625,6 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
UndefValue::get(NewTy->getElementType(1))});
}
- if (isa<GlobalValue>(C))
- report_fatal_error("Global values containing ptr addrspace(7) (buffer "
- "fat pointer) values are not supported");
-
if (auto *VC = dyn_cast<ConstantVector>(C)) {
if (Constant *S = VC->getSplatValue()) {
Constant *NewS = InternalMapper.mapConstant(*S);
@@ -660,127 +650,14 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
return ConstantStruct::get(NewTy, {RsrcVec, OffVec});
}
- // Constant expressions. This code mirrors how we fix up the equivalent
- // instructions later.
- auto *CE = dyn_cast<ConstantExpr>(C);
- if (!CE)
- return nullptr;
- if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
- Constant *RemappedPtr =
- InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand()));
- auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr);
- Type *OffTy = Off->getType();
- bool InBounds = GEPO->isInBounds();
-
- MapVector<Value *, APInt> VariableOffs;
- APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth);
- if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs,
- NewConstOffVal))
- report_fatal_error(
- "Scalable vector or unsized struct in fat pointer GEP");
- Constant *OffAccum = nullptr;
- // Accumulate offsets together before adding to the base in order to
- // preserve as many of the inbounds properties as possible.
- for (auto [Arg, Multiple] : VariableOffs) {
- Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg));
- NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL);
- if (!Multiple.isOne()) {
- if (Multiple.isPowerOf2()) {
- NewArg = ConstantExpr::getShl(
- NewArg,
- CE->getIntegerValue(
- OffTy, APInt(BufferOffsetWidth, Multiple.logBase2())),
- /*hasNUW=*/InBounds, /*HasNSW=*/InBounds);
- } else {
- NewArg =
- ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple),
- /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- }
- }
- if (OffAccum) {
- OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- } else {
- OffAccum = NewArg;
- }
- }
- Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal);
- if (OffAccum)
- OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff,
- /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- else
- OffAccum = NewConstOff;
- bool HasNonNegativeOff = false;
- if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
- HasNonNegativeOff = !CI->isNegative();
- }
- Constant *NewOff = ConstantExpr::getAdd(
- Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff,
- /*hasNSW=*/false);
- return ConstantStruct::get(NewTy, {Rsrc, NewOff});
- }
-
- if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) {
- Constant *Parts =
- InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand()));
- auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts);
- // Here, we take advantage of the fact that ptrtoint has a built-in
- // zero-extension behavior.
- unsigned FatPtrWidth =
- DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy);
- unsigned Width = SrcTy->getScalarSizeInBits();
- Constant *Shift =
- CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth));
- Constant *OffCast =
- ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL);
- Constant *RsrcHi = ConstantExpr::getShl(
- RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth);
- // This should be an or, but those got recently removed.
- Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true);
- return Result;
- }
+ if (isa<GlobalValue>(C))
+ report_fatal_error("Global values containing ptr addrspace(7) (buffer "
+ "fat pointer) values are not supported");
- if (CE->getOpcode() == Instruction::IntToPtr) {
- auto *Arg = cast<Constant>(CE->getOperand(0));
- unsigned FatPtrWidth =
- DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
- auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth);
- Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL);
-
- Constant *Shift =
- CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth));
- Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth);
- Type *RsrcTy = NewTy->getElementType(0);
- Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth);
- Constant *RsrcInt = CE->getTrunc(
- ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL),
- RsrcIntType);
- Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy);
- Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*isSigned=*/false, DL);
-
- return ConstantStruct::get(NewTy, {Rsrc, Off});
- }
+ if (isa<ConstantExpr>(C))
+ report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer "
+ "fat pointer) values should have been expanded earlier");
- if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) {
- unsigned SrcAS = AC->getSrcAddressSpace();
- unsigned DstAS = AC->getDestAddressSpace();
- auto *Arg = cast<Constant>(AC->getPointerOperand());
- auto *NewArg = InternalMapper.mapConstant(*Arg);
- if (!NewArg)
- return nullptr;
- if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER &&
- DstAS == AMDGPUAS::BUFFER_FAT_POINTER)
- return NewArg;
- if (SrcAS == AMDGPUAS::BUFFER_RESOURCE &&
- DstAS == AMDGPUAS::BUFFER_FAT_POINTER) {
- auto *NullOff = CE->getNullValue(NewTy->getElementType(1));
- return ConstantStruct::get(NewTy, {NewArg, NullOff});
- }
- report_fatal_error(
- "Unsupported address space cast for a buffer fat pointer");
- }
return nullptr;
}
@@ -788,26 +665,6 @@ Value *FatPtrConstMaterializer::materialize(Value *V) {
Constant *C = dyn_cast<Constant>(V);
if (!C)
return nullptr;
- if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
- // As a special case, adjust GEP constants that have a ptr addrspace(7) in
- // their source types here, since the earlier local changes didn't handle
- // htis.
- Type *SrcTy = GEPO->getSourceElementType();
- Type *NewSrcTy = IntTypeMap->remapType(SrcTy);
- if (SrcTy != NewSrcTy) {
- SmallVector<Constant *> Ops;
- Ops.reserve(GEPO->getNumOperands());
- for (const Use &U : GEPO->operands())
- Ops.push_back(cast<Constant>(U.get()));
- auto *NewGEP = ConstantExpr::getGetElementPtr(
- NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1),
- GEPO->getNoWrapFlags(), GEPO->getInRange());
- LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP
- << "\n");
- Value *FurtherMap = materialize(NewGEP);
- return FurtherMap ? FurtherMap : NewGEP;
- }
- }
// Structs and other types that happen to contain fat pointers get remapped
// by the mapValue() logic.
if (!isBufferFatPtrConst(C))
@@ -1387,57 +1244,25 @@ PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
}
PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ using namespace llvm::PatternMatch;
Value *Ptr = GEP.getPointerOperand();
if (!isSplitFatPtr(Ptr->getType()))
return {nullptr, nullptr};
IRB.SetInsertPoint(&GEP);
auto [Rsrc, Off] = getPtrParts(Ptr);
- Type *OffTy = Off->getType();
const DataLayout &DL = GEP.getModule()->getDataLayout();
bool InBounds = GEP.isInBounds();
- // In order to call collectOffset() and thus not have to reimplement it,
- // we need the GEP's pointer operand to have ptr addrspace(7) type
- GEP.setOperand(GEP.getPointerOperandIndex(),
- PoisonValue::get(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER)));
- MapVector<Value *, APInt> VariableOffs;
- APInt ConstOffVal = APInt::getZero(BufferOffsetWidth);
- if (!GEP.collectOffset(DL, BufferOffsetWidth, VariableOffs, ConstOffVal))
- report_fatal_error("Scalable vector or unsized struct in fat pointer GEP");
- GEP.setOperand(GEP.getPointerOperandIndex(), Ptr);
- Value *OffAccum = nullptr;
- // Accumulate offsets together before adding to the base in order to preserve
- // as many of the inbounds properties as possible.
- for (auto [Arg, Multiple] : VariableOffs) {
- if (auto *OffVecTy = dyn_cast<VectorType>(OffTy))
- if (!Arg->getType()->isVectorTy())
- Arg = IRB.CreateVectorSplat(OffVecTy->getElementCount(), Arg);
- Arg = IRB.CreateIntCast(Arg, OffTy, /*isSigned=*/true);
- if (!Multiple.isOne()) {
- if (Multiple.isPowerOf2())
- Arg = IRB.CreateShl(Arg, Multiple.logBase2(), "", /*hasNUW=*/InBounds,
- /*HasNSW=*/InBounds);
- else
- Arg = IRB.CreateMul(Arg, ConstantExpr::getIntegerValue(OffTy, Multiple),
- "", /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- }
- if (OffAccum)
- OffAccum = IRB.CreateAdd(OffAccum, Arg, "", /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- else
- OffAccum = Arg;
- }
- if (!ConstOffVal.isZero()) {
- Constant *ConstOff = ConstantExpr::getIntegerValue(OffTy, ConstOffVal);
- if (OffAccum)
- OffAccum = IRB.CreateAdd(OffAccum, ConstOff, "", /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- else
- OffAccum = ConstOff;
- }
-
- if (!OffAccum) { // Constant-zero offset
+ // In order to call emitGEPOffset() and thus not have to reimplement it,
+ // we need the GEP result to have ptr addrspace(7) type.
+ Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER);
+ if (auto *VT = dyn_cast<VectorType>(Off->getType()))
+ FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount());
+ GEP.mutateType(FatPtrTy);
+ Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
+ GEP.mutateType(Ptr->getType());
+ if (match(OffAccum, m_Zero())) { // Constant-zero offset
SplitUsers.insert(&GEP);
return {Rsrc, Off};
}
@@ -1447,7 +1272,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
HasNonNegativeOff = !CI->isNegative();
}
Value *NewOff;
- if (PatternMatch::match(Off, PatternMatch::is_zero())) {
+ if (match(Off, m_Zero())) {
NewOff = OffAccum;
} else {
NewOff = IRB.CreateAdd(Off, OffAccum, "",
@@ -1473,20 +1298,22 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
const DataLayout &DL = PI.getModule()->getDataLayout();
unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- Value *RsrcInt;
- if (Width <= BufferOffsetWidth)
- RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width));
- else
- RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
- copyMetadata(RsrcInt, &PI);
-
- Value *Shl = IRB.CreateShl(
- RsrcInt,
- ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "",
- Width >= FatPtrWidth, Width > FatPtrWidth);
- Value *OffCast =
- IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off");
- Value *Res = IRB.CreateOr(Shl, OffCast);
+ Value *Res;
+ if (Width <= BufferOffsetWidth) {
+ Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ } else {
+ Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
+ Value *Shl = IRB.CreateShl(
+ RsrcInt,
+ ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)),
+ "", Width >= FatPtrWidth, Width > FatPtrWidth);
+ Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ Res = IRB.CreateOr(Shl, OffCast);
+ }
+
+ copyMetadata(Res, &PI);
Res->takeName(&PI);
SplitUsers.insert(&PI);
PI.replaceAllUsesWith(Res);
@@ -1818,14 +1645,9 @@ public:
static bool containsBufferFatPointers(const Function &F,
BufferFatPtrToStructTypeMap *TypeMap) {
bool HasFatPointers = false;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB)
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
- for (const Use &U : I.operands())
- if (auto *C = dyn_cast<Constant>(U.get()))
- HasFatPointers |= isBufferFatPtrConst(C);
- }
- }
return HasFatPointers;
}
@@ -1924,6 +1746,36 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
"buffer resource pointers (address space 8) instead.");
}
+ {
+ // Collect all constant exprs and aggregates referenced by any function.
+ SmallVector<Constant *, 8> Worklist;
+ for (Function &F : M.functions())
+ for (Instruction &I : instructions(F))
+ for (Value *Op : I.operands())
+ if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
+ Worklist.push_back(cast<Constant>(Op));
+
+ // Recursively look for any referenced buffer pointer constants.
+ SmallPtrSet<Constant *, 8> Visited;
+ SetVector<Constant *> BufferFatPtrConsts;
+ while (!Worklist.empty()) {
+ Constant *C = Worklist.pop_back_val();
+ if (!Visited.insert(C).second)
+ continue;
+ if (isBufferFatPtrOrVector(C->getType()))
+ BufferFatPtrConsts.insert(C);
+ for (Value *Op : C->operands())
+ if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
+ Worklist.push_back(cast<Constant>(Op));
+ }
+
+ // Expand all constant expressions using fat buffer pointers to
+ // instructions.
+ Changed |= convertUsersOfConstantsToInstructions(
+ BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
+ /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
+ }
+
StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
@@ -1939,7 +1791,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
SmallVector<Function *> Intrinsics;
// Keep one big map so as to memoize constants across functions.
ValueToValueMapTy CloneMap;
- FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL);
+ FatPtrConstMaterializer Materializer(&StructTM, CloneMap);
ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
for (auto [F, InterfaceChange] : NeedsRemap) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index 6ec4178053b2..11f0cba47afd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -17,6 +17,157 @@
using namespace llvm;
+/// Print the immediate operand \p Imm of \p MI. The s_delay_alu immediate is
+/// printed through printSDelayAluImm; every other opcode is forwarded to the
+/// base MIRFormatter implementation.
+void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
+                                  std::optional<unsigned int> OpIdx,
+                                  int64_t Imm) const {
+  // Anything that is not s_delay_alu goes to the generic printer.
+  if (MI.getOpcode() != AMDGPU::S_DELAY_ALU) {
+    MIRFormatter::printImm(OS, MI, OpIdx, Imm);
+    return;
+  }
+
+  assert(OpIdx == 0);
+  printSDelayAluImm(Imm, OS);
+}
+
+/// Implement target specific parsing of immediate mnemonics. The mnemonic is
+/// a string with a leading dot.
+///
+/// Only s_delay_alu is handled here; its result comes straight from
+/// parseSDelayAluImmMnemonic (false on success). Every other opcode yields
+/// true, i.e. "not recognised".
+bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
+                                          const unsigned OpIdx,
+                                          StringRef Src, int64_t &Imm,
+                                          ErrorCallbackType ErrorCallback) const
+{
+  if (OpCode == AMDGPU::S_DELAY_ALU)
+    return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
+
+  // Don't know what this is
+  return true;
+}
+
+void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
+                                           llvm::raw_ostream &OS) const {
+  // Render the packed s_delay_alu immediate as a mnemonic of the form
+  //   .id0_<dep>[_skip_<count>_id1_<dep>]
+  // Fields as extracted below: bits [3:0] -> id0, [6:4] -> skip,
+  // [10:7] -> id1.
+  const uint64_t FirstDep = (Imm & 0xF);
+  const uint64_t SkipCount = ((Imm >> 4) & 0x7);
+  const uint64_t SecondDep = ((Imm >> 7) & 0xF);
+
+  // Writes the name of one dependency code to OS.
+  auto EmitDep = [&OS](uint64_t Dep) {
+    if (Dep == 0) {
+      OS << "NONE";
+      return;
+    }
+    if (Dep < 5) {
+      OS << "VALU_DEP_" << Dep;
+      return;
+    }
+    if (Dep < 8) {
+      OS << "TRANS32_DEP_" << Dep - 4;
+      return;
+    }
+    OS << "SALU_CYCLE_" << Dep - 8;
+  };
+
+  OS << ".id0_";
+  EmitDep(FirstDep);
+
+  // A skip of "same" (0) together with a second dependency of "none" (0) is
+  // the short form: nothing more to print.
+  if (SkipCount == 0 && SecondDep == 0)
+    return;
+
+  // Second delay specification.
+  OS << "_skip_";
+  switch (SkipCount) {
+  case 0:
+    OS << "SAME";
+    break;
+  case 1:
+    OS << "NEXT";
+    break;
+  default:
+    OS << "SKIP_" << SkipCount - 1;
+    break;
+  }
+
+  OS << "_id1_";
+  EmitDep(SecondDep);
+}
+
+/// Parse the textual form of an s_delay_alu immediate,
+///   .id0_<dep>[_skip_<count>_id1_<dep>]
+/// consuming it from \p Src and storing the encoded value in \p Imm.
+/// \returns false on success, otherwise whatever \p ErrorCallback returns.
+bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
+    const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+    llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const
+{
+  assert(OpIdx == 0);
+
+  Imm = 0;
+  bool Expected = Src.consume_front(".id0_");
+  if (!Expected)
+    return ErrorCallback(Src.begin(), "Expected .id0_");
+
+  // Consume a decimal integer and bias it by Offset. consumeInteger reports
+  // failure by returning true, in which case -1 is handed back as the error
+  // marker.
+  auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t {
+    int64_t Dep;
+    if (!Src.consumeInteger(10, Dep))
+      return Dep + Offset;
+
+    return -1;
+  };
+
+  // Decode one dependency name into its numeric code, or -1 if the text does
+  // not start with a known dependency name.
+  auto DecodeDelay = [&](StringRef &Src) -> int64_t {
+    if (Src.consume_front("NONE"))
+      return 0;
+    if (Src.consume_front("VALU_DEP_"))
+      return ExpectInt(Src, 0);
+    if (Src.consume_front("TRANS32_DEP_"))
+      return ExpectInt(Src, 4);
+    if (Src.consume_front("SALU_CYCLE_"))
+      return ExpectInt(Src, 8);
+
+    return -1;
+  };
+
+  int64_t Delay0 = DecodeDelay(Src);
+  int64_t Skip = 0;
+  int64_t Delay1 = 0;
+  if (Delay0 == -1)
+    return ErrorCallback(Src.begin(), "Could not decode delay0");
+
+
+  // Set the Imm so far, so that early return has the correct value.
+  Imm = Delay0;
+
+  // If that was the end of the string, the second instruction is "same" and
+  // "none"
+  if (Src.begin() == Src.end())
+    return false;
+
+  Expected = Src.consume_front("_skip_");
+  if (!Expected)
+    return ErrorCallback(Src.begin(), "Expected _skip_");
+
+
+  if (Src.consume_front("SAME")) {
+    Skip = 0;
+  } else if (Src.consume_front("NEXT")) {
+    Skip = 1;
+  } else if (Src.consume_front("SKIP_")) {
+    if (Src.consumeInteger(10, Skip)) {
+      return ErrorCallback(Src.begin(), "Expected integer Skip value");
+    }
+    Skip += 1;
+  } else {
+    // Stop here: falling through would raise a second, misleading
+    // "Expected _id1_" error for the same position.
+    return ErrorCallback(Src.begin(), "Unexpected Skip Value");
+  }
+
+  Expected = Src.consume_front("_id1_");
+  if (!Expected)
+    return ErrorCallback(Src.begin(), "Expected _id1_");
+
+  Delay1 = DecodeDelay(Src);
+  if (Delay1 == -1)
+    return ErrorCallback(Src.begin(), "Could not decode delay1");
+
+  // Pack the fields: skip starts at bit 4, id1 at bit 7.
+  Imm = Imm | (Skip << 4) | (Delay1 << 7);
+  return false;
+}
+
bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 98b5031071cf..c5c947375252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -28,12 +28,35 @@ public:
AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;
+ /// Implement target specific printing for machine operand immediate value, so
+ /// that we can have more meaningful mnemonic than a 64-bit integer. Passing
+ /// None to OpIdx means the index is unknown.
+ virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx,
+ int64_t Imm) const override;
+
+ /// Implement target specific parsing of immediate mnemonics. The mnemonic is
+ /// a string with a leading dot.
+ virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
+
/// Implement target specific parsing of target custom pseudo source value.
bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
ErrorCallbackType ErrorCallback) const override;
+
+private:
+ /// Print the string to represent s_delay_alu immediate value
+ void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;
+
+ /// Parse the immediate pseudo literal for s_delay_alu
+ bool parseSDelayAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+ llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const;
+
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index f36374b08b34..cfe9f33efc91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -100,7 +100,7 @@ public:
bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;
// Combine unsigned buffer load and signed extension instructions to generate
- // signed buffer laod instructions.
+ // signed buffer load instructions.
bool matchCombineSignExtendInReg(
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
void applyCombineSignExtendInReg(
@@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 3f01a328afaf..4d0cb467ba37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
AU.addRequired<GISelCSEAnalysisWrapperPass>();
@@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 35abd6eddde8..74f0540239c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
const auto *LI = ST.getLegalizerInfo();
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index 2ea03ddb1fcc..d1985f46b1c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const {
void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineCycleInfoWrapperPass>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
// TODO: Preserve DomTree
RegBankSelect::getAnalysisUsage(AU);
}
@@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
@@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineCycleInfo &CycleInfo =
getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
- MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree &DomTree =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MachineUniformityInfo Uniformity =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ebd674757fb..9e7694f41d6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3079,7 +3079,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
@@ -4376,7 +4375,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
@@ -4907,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
- case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd_v2bf16: {
+ case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
@@ -5221,11 +5218,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
+ case AMDGPU::G_ATOMICRMW_FMIN:
+ case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 410dc83d45c5..ed5bae3e4ff6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -252,21 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
@@ -280,7 +267,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -298,7 +284,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -316,7 +301,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -334,12 +318,10 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 2449fa581842..3e5d83b8e3fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -15,10 +15,9 @@
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
-/// - Kernels are used as the module's "roots".
-/// They're known entry points on AMDGPU, and everything else is often
-/// internal only.
-/// - Each kernel has a set of dependencies, and when a kernel and its
+/// - The module has "split roots", which are kernels in the vast
+///   majority of cases.
+/// - Each root has a set of dependencies, and when a root and its
/// dependencies is considered "big", we try to put it in a partition where
/// most dependencies are already imported, to avoid duplicating large
/// amounts of code.
@@ -67,20 +66,22 @@ using namespace llvm;
namespace {
-static cl::opt<float> LargeKernelFactor(
- "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
+// Hidden option: multiple of the average partition cost above which a
+// function plus its dependencies is treated as "large" (0 disables this).
+static cl::opt<float> LargeFnFactor(
+ "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
cl::Hidden,
cl::desc(
- "consider a kernel as large and needing special treatment when it "
+ "consider a function as large and needing special treatment when the "
+ "cost of importing it into a partition "
"exceeds the average cost of a partition by this factor; e;g. 2.0 "
- "means if the kernel and its dependencies is 2 times bigger than "
- "an average partition; 0 disables large kernels handling entirely"));
+ "means if the function and its dependencies is 2 times bigger than "
+ "an average partition; 0 disables large functions handling entirely"));
-static cl::opt<float> LargeKernelOverlapForMerge(
- "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
+static cl::opt<float> LargeFnOverlapForMerge(
+ "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
cl::Hidden,
- cl::desc("defines how much overlap between two large kernel's dependencies "
- "is needed to put them in the same partition"));
+ cl::desc(
+ "defines how much overlap between two large function's dependencies "
+ "is needed to put them in the same partition"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -98,6 +99,7 @@ static cl::opt<bool>
using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static bool isEntryPoint(const Function *F) {
return AMDGPU::isEntryFunctionCC(F->getCallingConv());
@@ -214,13 +216,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
/// Calculate the cost of each function in \p M
/// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
- Module &M,
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
DenseMap<const Function *, CostType> &CostMap) {
CostType ModuleCost = 0;
CostType KernelCost = 0;
@@ -230,8 +231,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
continue;
CostType FnCost = 0;
- TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
+ const auto &TTI = GetTTI(Fn);
for (const auto &BB : Fn) {
for (const auto &I : BB) {
auto Cost =
@@ -277,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) {
/*IgnoreCastedDirectCall=*/true);
}
-/// When a kernel or any of its callees performs an indirect call, this function
+/// When a function or any of its callees performs an indirect call, this
/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
+/// functions to \p Fns so they can be counted as dependencies of the function.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
@@ -301,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M,
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
+/// \param OnlyDirect Whether to only consider direct callees.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the function it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
const Function &Fn,
- DenseSet<const Function *> &Fns,
+ DenseSet<const Function *> &Fns, bool OnlyDirect,
bool &HadIndirectCall) {
assert(!Fn.isDeclaration());
@@ -325,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
auto *CGNode = CGEntry.second;
auto *Callee = CGNode->getFunction();
if (!Callee) {
+ if (OnlyDirect)
+ continue;
+
// Functions have an edge towards CallsExternalNode if they're external
// declarations, or if they do an indirect call. As we only process
// definitions here, we know this means the function has an indirect
@@ -353,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
}
}
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
- KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
- const DenseMap<const Function *, CostType> &FnCosts,
- const Function *Fn)
+/// Contains information about a function and its dependencies.
+/// This is a splitting root. The splitting algorithm works by
+/// assigning these to partitions.
+struct FunctionWithDependencies {
+ FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+ const DenseMap<const Function *, CostType> &FnCosts,
+ const Function *Fn)
: Fn(Fn) {
- addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+ // When Fn is not a kernel, we don't need to collect indirect callees.
+ // Resource usage analysis is only performed on kernels, and we collect
+ // indirect callees for resource usage analysis.
+ addAllDependencies(SML, CG, *Fn, Dependencies,
+ /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
TotalCost = FnCosts.at(Fn);
for (const auto *Dep : Dependencies) {
TotalCost += FnCosts.at(Dep);
@@ -380,8 +390,8 @@ struct KernelWithDependencies {
CostType TotalCost = 0;
- /// \returns true if this kernel and its dependencies can be considered large
- /// according to \p Threshold.
+ /// \returns true if this function and its dependencies can be considered
+ /// large according to \p Threshold.
bool isLarge(CostType Threshold) const {
return TotalCost > Threshold && !Dependencies.empty();
}
@@ -420,39 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A,
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
+/// \param WorkList Functions and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts )
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
CostType ModuleCost,
const DenseMap<const Function *, CostType> &FnCosts,
- const SmallVector<KernelWithDependencies> &WorkList) {
+ const SmallVector<FunctionWithDependencies> &WorkList) {
SML << "\n--Partitioning Starts--\n";
- // Calculate a "large kernel threshold". When more than one kernel's total
- // import cost exceeds this value, we will try to merge it with other,
- // similarly large kernels.
+ // Calculate a "large function threshold". When more than one function's total
+ // import cost exceeds this value, we will try to assign it to an existing
+ // partition to reduce the amount of duplication needed.
//
- // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+ // e.g. let two functions X and Y have an import cost of ~10% of the module, we
// assign X to a partition as usual, but when we get to Y, we check if it's
// worth also putting it in X's partition.
- const CostType LargeKernelThreshold =
- LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
- : std::numeric_limits<CostType>::max();
+ const CostType LargeFnThreshold =
+ LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
+ : std::numeric_limits<CostType>::max();
std::vector<DenseSet<const Function *>> Partitions;
Partitions.resize(NumParts);
- // Assign a partition to each kernel, and try to keep the partitions more or
+ // Assign functions to partitions, and try to keep the partitions more or
// less balanced. We do that through a priority queue sorted in reverse, so we
// can always look at the partition with the least content.
//
// There are some cases where we will be deliberately unbalanced though.
- // - Large kernels: we try to merge with existing partitions to reduce code
+ // - Large functions: we try to merge with existing partitions to reduce code
// duplication.
- // - Kernels with indirect or external calls always go in the first partition
- // (P0).
+ // - Functions with indirect or external calls always go in the first
+ // partition (P0).
auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
const std::pair<PartitionID, CostType> &b) {
// When two partitions have the same cost, assign to the one with the
@@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
for (unsigned I = 0; I < NumParts; ++I)
BalancingQueue.push_back(std::make_pair(I, 0));
- // Helper function to handle assigning a kernel to a partition. This takes
+ // Helper function to handle assigning a function to a partition. This takes
// care of updating the balancing queue.
const auto AssignToPartition = [&](PartitionID PID,
- const KernelWithDependencies &KWD) {
+ const FunctionWithDependencies &FWD) {
auto &FnsInPart = Partitions[PID];
- FnsInPart.insert(KWD.Fn);
- FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+ FnsInPart.insert(FWD.Fn);
+ FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
- SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
- if (!KWD.Dependencies.empty()) {
- SML << KWD.Dependencies.size() << " dependencies added\n";
+ SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
+ if (!FWD.Dependencies.empty()) {
+ SML << FWD.Dependencies.size() << " dependencies added\n";
};
// Update the balancing queue. we scan backwards because in the common case
@@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
sort(BalancingQueue, ComparePartitions);
};
- for (auto &CurKernel : WorkList) {
- // When a kernel has indirect calls, it must stay in the first partition
+ for (auto &CurFn : WorkList) {
+ // When a function has indirect calls, it must stay in the first partition
// alongside every reachable non-entry function. This is a nightmare case
// for splitting as it severely limits what we can do.
- if (CurKernel.HasIndirectCall) {
- SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
+ if (CurFn.HasIndirectCall) {
+ SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
<< " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ AssignToPartition(0, CurFn);
continue;
}
- // When a kernel has non duplicatable dependencies, we have to keep it in
+ // When a function has non duplicatable dependencies, we have to keep it in
// the first partition as well. This is a conservative approach, a
// finer-grained approach could keep track of which dependencies are
// non-duplicatable exactly and just make sure they're grouped together.
- if (CurKernel.HasNonDuplicatableDependecy) {
- SML << "Kernel with externally visible dependency "
- << getName(*CurKernel.Fn) << " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ if (CurFn.HasNonDuplicatableDependecy) {
+ SML << "Function with externally visible dependency "
+ << getName(*CurFn.Fn) << " defaulting to P0\n";
+ AssignToPartition(0, CurFn);
continue;
}
- // Be smart with large kernels to avoid duplicating their dependencies.
- if (CurKernel.isLarge(LargeKernelThreshold)) {
- assert(LargeKernelOverlapForMerge >= 0.0f &&
- LargeKernelOverlapForMerge <= 1.0f);
- SML << "Large Kernel: " << getName(*CurKernel.Fn)
+ // Be smart with large functions to avoid duplicating their dependencies.
+ if (CurFn.isLarge(LargeFnThreshold)) {
+ assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
+ SML << "Large Function: " << getName(*CurFn.Fn)
<< " - looking for partition with at least "
- << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+ << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
bool Assigned = false;
for (const auto &[PID, Fns] : enumerate(Partitions)) {
- float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+ float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
<< PID << '\n';
- if (Overlap > LargeKernelOverlapForMerge) {
+ if (Overlap > LargeFnOverlapForMerge) {
SML << " selecting P" << PID << '\n';
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
Assigned = true;
}
}
@@ -554,41 +563,34 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
// Normal "load-balancing", assign to partition with least pressure.
auto [PID, CurCost] = BalancingQueue.back();
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
}
- // Work is mostly done now, verify the partioning and add all functions we may
- // have missed (= unreachable, or we don't understand how they're reached) to
- // P0.
- DenseSet<const Function *> AllFunctions;
- for (const auto &[Idx, Part] : enumerate(Partitions)) {
- CostType Cost = 0;
- for (auto *Fn : Part) {
- // external linkage functions should exclusively be in the first partition
- // at this stage. In theory, we should only ever see external linkage
- // functions here if they're kernels, or if they've been added due to a
- // kernel using indirect calls somewhere in its CallGraph.
- assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
- Cost += FnCosts.at(Fn);
+ if (SML) {
+ for (const auto &[Idx, Part] : enumerate(Partitions)) {
+ CostType Cost = 0;
+ for (auto *Fn : Part)
+ Cost += FnCosts.at(Fn);
+ SML << "P" << Idx << " has a total cost of " << Cost << " ("
+ << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+ << "% of source module)\n";
}
- SML << "P" << Idx << " has a total cost of " << Cost << " ("
- << format("%0.2f", (float(Cost) / ModuleCost) * 100)
- << "% of source module)\n";
- AllFunctions.insert(Part.begin(), Part.end());
+
+ SML << "--Partitioning Done--\n\n";
}
- // Add missed functions to P0. This will take care of adding things like
- // external functions with no callers in the module to P0. This should be
- // fairly rare as AMDGPU internalizes everything in most cases, so unused
- // internal functions would get removed.
+ // Check no functions were missed.
+#ifndef NDEBUG
+ DenseSet<const Function *> AllFunctions;
+ for (const auto &Part : Partitions)
+ AllFunctions.insert(Part.begin(), Part.end());
+
for (auto &Fn : M) {
if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
- SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
- Partitions[0].insert(&Fn);
+ assert(AllFunctions.contains(&Fn) && "Missed a function?!");
}
}
-
- SML << "--Partitioning Done--\n\n";
+#endif
return Partitions;
}
@@ -604,10 +606,17 @@ static void externalize(GlobalValue &GV) {
if (!GV.hasName())
GV.setName("__llvmsplit_unnamed");
}
-} // end anonymous namespace
-void llvm::splitAMDGPUModule(
- const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+static bool hasDirectCaller(const Function &Fn) {
+ for (auto &U : Fn.uses()) {
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+static void splitAMDGPUModule(
+ GetTTIFn GetTTI, Module &M, unsigned N,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
SplitModuleLogger SML(M);
@@ -648,15 +657,36 @@ void llvm::splitAMDGPUModule(
// Start by calculating the cost of every function in the module, as well as
// the module's overall cost.
DenseMap<const Function *, CostType> FnCosts;
- const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+ const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
- // Gather every kernel into a WorkList, then sort it by descending total cost
- // of the kernel so the biggest kernels are seen first.
- SmallVector<KernelWithDependencies> WorkList;
+ // First, gather every kernel into the worklist.
+ SmallVector<FunctionWithDependencies> WorkList;
for (auto &Fn : M) {
if (isEntryPoint(&Fn) && !Fn.isDeclaration())
WorkList.emplace_back(SML, CG, FnCosts, &Fn);
}
+
+ // Then, find missing functions that need to be considered as additional
+ // roots. These can't be called in theory, but in practice we still have to
+ // handle them to avoid linker errors.
+ {
+ DenseSet<const Function *> SeenFunctions;
+ for (const auto &FWD : WorkList) {
+ SeenFunctions.insert(FWD.Fn);
+ SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ }
+
+ for (auto &Fn : M) {
+ // If this function is not part of any kernel's dependencies and isn't
+ // directly called, consider it as a root.
+ if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
+ !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
+ WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+ }
+ }
+ }
+
+ // Sort the worklist so the most expensive roots are seen first.
sort(WorkList, [&](auto &A, auto &B) {
// Sort by total cost, and if the total cost is identical, sort
// alphabetically.
@@ -667,13 +697,20 @@ void llvm::splitAMDGPUModule(
if (SML) {
SML << "Worklist\n";
- for (const auto &KWD : WorkList) {
- SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
- << " indirect:" << KWD.HasIndirectCall
- << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+ for (const auto &FWD : WorkList) {
+ SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
+ << " indirect:" << FWD.HasIndirectCall
+ << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
<< ")\n";
- for (const auto *Dep : KWD.Dependencies)
- SML << " [Dep] " << getName(*Dep) << '\n';
+ // Sort function names before printing to ensure determinism.
+ SmallVector<std::string> SortedDepNames;
+ SortedDepNames.reserve(FWD.Dependencies.size());
+ for (const auto *Dep : FWD.Dependencies)
+ SortedDepNames.push_back(getName(*Dep));
+ sort(SortedDepNames);
+
+ for (const auto &Name : SortedDepNames)
+ SML << " [dependency] " << Name << '\n';
}
}
@@ -700,16 +737,8 @@ void llvm::splitAMDGPUModule(
std::unique_ptr<Module> MPart(
CloneModule(M, VMap, [&](const GlobalValue *GV) {
// Functions go in their assigned partition.
- if (const auto *Fn = dyn_cast<Function>(GV)) {
-// Check we don't import an external linkage function in any
-// partition other than P0.
-#ifndef NDEBUG
- if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
- assert((I == 0) == FnsInPart.contains(Fn));
- }
-#endif
+ if (const auto *Fn = dyn_cast<Function>(GV))
return FnsInPart.contains(Fn);
- }
if (NeedsConservativeImport(GV))
return true;
@@ -742,3 +771,16 @@ void llvm::splitAMDGPUModule(
<< format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
<< "% of original module)\n";
}
+} // namespace
+
+PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+ splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+ // We don't change the original module.
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
index 6171643bd4ad..d814dedd6f0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
@@ -12,18 +12,27 @@
#define LLVM_TARGET_AMDGPUSPLITMODULE_H
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/IR/PassManager.h"
#include <memory>
namespace llvm {
-class Module;
-class AMDGPUTargetMachine;
-
/// Splits the module M into N linkable partitions. The function ModuleCallback
/// is called N times passing each individual partition as the MPart argument.
-void splitAMDGPUModule(
- const AMDGPUTargetMachine &TM, Module &M, unsigned N,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
+class AMDGPUSplitModulePass : public PassInfoMixin<AMDGPUSplitModulePass> {
+public:
+ using ModuleCreationCallback =
+ function_ref<void(std::unique_ptr<Module> MPart)>;
+
+ AMDGPUSplitModulePass(unsigned N, ModuleCreationCallback ModuleCallback)
+ : N(N), ModuleCallback(ModuleCallback) {}
+
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+ unsigned N;
+ ModuleCreationCallback ModuleCallback;
+};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0751c8dc8b8b..a8e26f104f58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1104,6 +1104,9 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
if (hasFlatScratchInit())
NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
+
+ if (hasPrivateSegmentSize())
+ NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce997c659094..9162e110aa10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -658,8 +658,7 @@ Error AMDGPUTargetMachine::buildCodeGenPipeline(
return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
@@ -829,8 +828,24 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
bool AMDGPUTargetMachine::splitModule(
Module &M, unsigned NumParts,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
- splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+ // FIXME(?): It would be better to use an already existing
+ // Analysis/PassManager, but no current user of this API has one ready, so
+ // they would need to create one anyway. Let's hide the boilerplate for now
+ // to keep it simple.
+
+ LoopAnalysisManager LAM;
+ FunctionAnalysisManager FAM;
+ CGSCCAnalysisManager CGAM;
+ ModuleAnalysisManager MAM;
+
+ PassBuilder PB(this);
+ PB.registerModuleAnalyses(MAM);
+ PB.registerFunctionAnalyses(FAM);
+ PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+ ModulePassManager MPM;
+ MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
+ MPM.run(M, MAM);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 2cfd232483a8..0f74fbc22fa8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -58,8 +58,7 @@ public:
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
void registerDefaultAliasAnalyses(AAManager &) override;
/// Get the integer value of a null pointer in the given address space.
@@ -76,7 +75,7 @@ public:
bool splitModule(Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)>
- ModuleCallback) const override;
+ ModuleCallback) override;
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b..1192b49fd1f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
@@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
@@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bdb5a8d9a0a0..b08957d22ee7 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1314,6 +1314,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
/// }
private:
+ void createConstantSymbol(StringRef Id, int64_t Val);
+
bool ParseAsAbsoluteExpression(uint32_t &Ret);
bool OutOfRangeError(SMRange Range);
/// Calculate VGPR/SGPR blocks required for given target, reserved
@@ -1331,12 +1333,12 @@ private:
/// \param SGPRRange [in] Token range, used for SGPR diagnostics.
/// \param VGPRBlocks [out] Result VGPR block count.
/// \param SGPRBlocks [out] Result SGPR block count.
- bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
- bool FlatScrUsed, bool XNACKUsed,
+ bool calculateGPRBlocks(const FeatureBitset &Features, const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed, bool XNACKUsed,
std::optional<bool> EnableWavefrontSize32,
- unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange,
- unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ const MCExpr *NextFreeVGPR, SMRange VGPRRange,
+ const MCExpr *NextFreeSGPR, SMRange SGPRRange,
+ const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks);
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSACodeObjectVersion();
bool ParseDirectiveAMDHSAKernel();
@@ -1408,36 +1410,28 @@ public:
setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
- {
- // TODO: make those pre-defined variables read-only.
- // Currently there is none suitable machinery in the core llvm-mc for this.
- // MCSymbol::isRedefinable is intended for another purpose, and
- // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
- MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
- } else {
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
- }
- if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
- initializeGprCountSymbol(IS_VGPR);
- initializeGprCountSymbol(IS_SGPR);
- } else
- KernelScope.initialize(getContext());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
+ createConstantSymbol(".amdgcn.gfx_generation_number", ISA.Major);
+ createConstantSymbol(".amdgcn.gfx_generation_minor", ISA.Minor);
+ createConstantSymbol(".amdgcn.gfx_generation_stepping", ISA.Stepping);
+ } else {
+ createConstantSymbol(".option.machine_version_major", ISA.Major);
+ createConstantSymbol(".option.machine_version_minor", ISA.Minor);
+ createConstantSymbol(".option.machine_version_stepping", ISA.Stepping);
}
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
+ initializeGprCountSymbol(IS_VGPR);
+ initializeGprCountSymbol(IS_SGPR);
+ } else
+ KernelScope.initialize(getContext());
+
+ for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
+ createConstantSymbol(Symbol, Code);
+
+ createConstantSymbol("UC_VERSION_W64_BIT", 0x2000);
+ createConstantSymbol("UC_VERSION_W32_BIT", 0x4000);
+ createConstantSymbol("UC_VERSION_MDP_BIT", 0x8000);
}
bool hasMIMG_R128() const {
@@ -2486,6 +2480,16 @@ bool AMDGPUOperand::isInlineValue() const {
// AsmParser
//===----------------------------------------------------------------------===//
+void AMDGPUAsmParser::createConstantSymbol(StringRef Id, int64_t Val) {
+ // TODO: make those pre-defined variables read-only.
+ // Currently there is no suitable machinery in the core llvm-mc for this.
+ // MCSymbol::isRedefinable is intended for another purpose, and
+ // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
+ Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
+}
+
static int getRegClass(RegisterKind Is, unsigned RegWidth) {
if (Is == IS_VGPR) {
switch (RegWidth) {
@@ -5352,41 +5356,64 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
}
bool AMDGPUAsmParser::calculateGPRBlocks(
- const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
- bool XNACKUsed, std::optional<bool> EnableWavefrontSize32,
- unsigned NextFreeVGPR, SMRange VGPRRange, unsigned NextFreeSGPR,
- SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks) {
+ const FeatureBitset &Features, const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed, bool XNACKUsed,
+ std::optional<bool> EnableWavefrontSize32, const MCExpr *NextFreeVGPR,
+ SMRange VGPRRange, const MCExpr *NextFreeSGPR, SMRange SGPRRange,
+ const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks) {
// TODO(scott.linder): These calculations are duplicated from
// AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
IsaVersion Version = getIsaVersion(getSTI().getCPU());
+ MCContext &Ctx = getContext();
- unsigned NumVGPRs = NextFreeVGPR;
- unsigned NumSGPRs = NextFreeSGPR;
+ const MCExpr *NumSGPRs = NextFreeSGPR;
+ int64_t EvaluatedSGPRs;
if (Version.Major >= 10)
- NumSGPRs = 0;
+ NumSGPRs = MCConstantExpr::create(0, Ctx);
else {
unsigned MaxAddressableNumSGPRs =
IsaInfo::getAddressableNumSGPRs(&getSTI());
- if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
- NumSGPRs > MaxAddressableNumSGPRs)
+ if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && Version.Major >= 8 &&
+ !Features.test(FeatureSGPRInitBug) &&
+ static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs)
return OutOfRangeError(SGPRRange);
- NumSGPRs +=
- IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
+ const MCExpr *ExtraSGPRs =
+ AMDGPUMCExpr::createExtraSGPRs(VCCUsed, FlatScrUsed, XNACKUsed, Ctx);
+ NumSGPRs = MCBinaryExpr::createAdd(NumSGPRs, ExtraSGPRs, Ctx);
- if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
- NumSGPRs > MaxAddressableNumSGPRs)
+ if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) &&
+ (Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs)
return OutOfRangeError(SGPRRange);
if (Features.test(FeatureSGPRInitBug))
- NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- }
+ NumSGPRs =
+ MCConstantExpr::create(IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG, Ctx);
+ }
+
+ // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+ // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+ auto GetNumGPRBlocks = [&Ctx](const MCExpr *NumGPR,
+ unsigned Granule) -> const MCExpr * {
+ const MCExpr *OneConst = MCConstantExpr::create(1ul, Ctx);
+ const MCExpr *GranuleConst = MCConstantExpr::create(Granule, Ctx);
+ const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *AlignToGPR =
+ AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ const MCExpr *DivGPR =
+ MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+ const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+ return SubGPR;
+ };
- VGPRBlocks = IsaInfo::getEncodedNumVGPRBlocks(&getSTI(), NumVGPRs,
- EnableWavefrontSize32);
- SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
+ VGPRBlocks = GetNumGPRBlocks(
+ NextFreeVGPR,
+ IsaInfo::getVGPREncodingGranule(&getSTI(), EnableWavefrontSize32));
+ SGPRBlocks =
+ GetNumGPRBlocks(NumSGPRs, IsaInfo::getSGPREncodingGranule(&getSTI()));
return false;
}
@@ -5410,14 +5437,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IsaVersion IVersion = getIsaVersion(getSTI().getCPU());
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, getContext());
+ const MCExpr *OneExpr = MCConstantExpr::create(1, getContext());
+
SMRange VGPRRange;
- uint64_t NextFreeVGPR = 0;
- uint64_t AccumOffset = 0;
+ const MCExpr *NextFreeVGPR = ZeroExpr;
+ const MCExpr *AccumOffset = MCConstantExpr::create(0, getContext());
uint64_t SharedVGPRCount = 0;
uint64_t PreloadLength = 0;
uint64_t PreloadOffset = 0;
SMRange SGPRRange;
- uint64_t NextFreeSGPR = 0;
+ const MCExpr *NextFreeSGPR = ZeroExpr;
// Count the number of user SGPRs implied from the enabled feature bits.
unsigned ImpliedUserSGPRCount = 0;
@@ -5425,8 +5455,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
// Track if the asm explicitly contains the directive for the user SGPR
// count.
std::optional<unsigned> ExplicitUserSGPRCount;
- bool ReserveVCC = true;
- bool ReserveFlatScr = true;
+ const MCExpr *ReserveVCC = OneExpr;
+ const MCExpr *ReserveFlatScr = OneExpr;
std::optional<bool> EnableWavefrontSize32;
while (true) {
@@ -5620,34 +5650,29 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal,
ValRange);
} else if (ID == ".amdhsa_next_free_vgpr") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
VGPRRange = ValRange;
- NextFreeVGPR = Val;
+ NextFreeVGPR = ExprVal;
} else if (ID == ".amdhsa_next_free_sgpr") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
SGPRRange = ValRange;
- NextFreeSGPR = Val;
+ NextFreeSGPR = ExprVal;
} else if (ID == ".amdhsa_accum_offset") {
if (!isGFX90A())
return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
- AccumOffset = Val;
+ AccumOffset = ExprVal;
} else if (ID == ".amdhsa_reserve_vcc") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
- if (!isUInt<1>(Val))
+ if (EvaluatableExpr && !isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveVCC = Val;
+ ReserveVCC = ExprVal;
} else if (ID == ".amdhsa_reserve_flat_scratch") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
"directive is not supported with architected flat scratch",
IDRange);
- if (!isUInt<1>(Val))
+ if (EvaluatableExpr && !isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveFlatScr = Val;
+ ReserveFlatScr = ExprVal;
} else if (ID == ".amdhsa_reserve_xnack_mask") {
if (IVersion.Major < 8)
return Error(IDRange.Start, "directive requires gfx8+", IDRange);
@@ -5771,8 +5796,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!Seen.contains(".amdhsa_next_free_sgpr"))
return TokError(".amdhsa_next_free_sgpr directive is required");
- unsigned VGPRBlocks;
- unsigned SGPRBlocks;
+ const MCExpr *VGPRBlocks;
+ const MCExpr *SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
getTargetStreamer().getTargetID()->isXnackOnOrAny(),
EnableWavefrontSize32, NextFreeVGPR,
@@ -5780,19 +5805,26 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
SGPRBlocks))
return true;
- if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
- VGPRBlocks))
+ int64_t EvaluatedVGPRBlocks;
+ bool VGPRBlocksEvaluatable =
+ VGPRBlocks->evaluateAsAbsolute(EvaluatedVGPRBlocks);
+ if (VGPRBlocksEvaluatable &&
+ !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
+ static_cast<uint64_t>(EvaluatedVGPRBlocks))) {
return OutOfRangeError(VGPRRange);
+ }
AMDGPU::MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()),
+ KD.compute_pgm_rsrc1, VGPRBlocks,
COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT,
COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext());
- if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
- SGPRBlocks))
+ int64_t EvaluatedSGPRBlocks;
+ if (SGPRBlocks->evaluateAsAbsolute(EvaluatedSGPRBlocks) &&
+ !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
+ static_cast<uint64_t>(EvaluatedSGPRBlocks)))
return OutOfRangeError(SGPRRange);
AMDGPU::MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()),
+ KD.compute_pgm_rsrc1, SGPRBlocks,
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT,
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext());
@@ -5822,16 +5854,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (isGFX90A()) {
if (!Seen.contains(".amdhsa_accum_offset"))
return TokError(".amdhsa_accum_offset directive is required");
- if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
+ int64_t EvaluatedAccum;
+ bool AccumEvaluatable = AccumOffset->evaluateAsAbsolute(EvaluatedAccum);
+ uint64_t UEvaluatedAccum = EvaluatedAccum;
+ if (AccumEvaluatable &&
+ (UEvaluatedAccum < 4 || UEvaluatedAccum > 256 || (UEvaluatedAccum & 3)))
return TokError("accum_offset should be in range [4..256] in "
"increments of 4");
- if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
+
+ int64_t EvaluatedNumVGPR;
+ if (NextFreeVGPR->evaluateAsAbsolute(EvaluatedNumVGPR) &&
+ AccumEvaluatable &&
+ UEvaluatedAccum >
+ alignTo(std::max((uint64_t)1, (uint64_t)EvaluatedNumVGPR), 4))
return TokError("accum_offset exceeds total VGPR allocation");
- MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc3,
- MCConstantExpr::create(AccumOffset / 4 - 1, getContext()),
- COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
- COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext());
+ const MCExpr *AdjustedAccum = MCBinaryExpr::createSub(
+ MCBinaryExpr::createDiv(
+ AccumOffset, MCConstantExpr::create(4, getContext()), getContext()),
+ MCConstantExpr::create(1, getContext()), getContext());
+ MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, AdjustedAccum,
+ COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+ COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ getContext());
}
if (IVersion.Major >= 10 && IVersion.Major < 12) {
@@ -5840,7 +5884,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError("shared_vgpr_count directive not valid on "
"wavefront size 32");
}
- if (SharedVGPRCount * 2 + VGPRBlocks > 63) {
+
+ if (VGPRBlocksEvaluatable &&
+ (SharedVGPRCount * 2 + static_cast<uint64_t>(EvaluatedVGPRBlocks) >
+ 63)) {
return TokError("shared_vgpr_count*2 + "
"compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot "
"exceed 63\n");
@@ -8353,7 +8400,7 @@ void AMDGPUAsmParser::onBeginOfFile() {
/// max(expr, ...)
///
bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
- using AGVK = AMDGPUVariadicMCExpr::VariadicKind;
+ using AGVK = AMDGPUMCExpr::VariantKind;
if (isToken(AsmToken::Identifier)) {
StringRef TokenId = getTokenStr();
@@ -8383,7 +8430,7 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
"mismatch of commas in " + Twine(TokenId) + " expression");
return true;
}
- Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext());
+ Res = AMDGPUMCExpr::create(VK, Exprs, getContext());
return false;
}
const MCExpr *Expr;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b05834e5803a..3b8d94b74400 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -399,12 +399,10 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
RegisterOperand tfeVDataOp =
- !if(!eq(RC.Size, 32), AVLdSt_64,
- !if(!eq(RC.Size, 64), AVLdSt_96,
- !if(!eq(RC.Size, 96), AVLdSt_128,
- !if(!eq(RC.Size, 128), AVLdSt_160,
- RegisterOperand<VReg_1> // Invalid register.
- ))));
+ !cond(!eq(RC.Size, 32) : AVLdSt_64,
+ !eq(RC.Size, 64) : AVLdSt_96,
+ !eq(RC.Size, 96) : AVLdSt_128,
+ !eq(RC.Size, 128) : AVLdSt_160);
RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret);
}
@@ -534,7 +532,7 @@ multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i3
}
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>;
}
defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>;
@@ -631,7 +629,7 @@ multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt =
}
multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>;
}
defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>;
@@ -1151,27 +1149,21 @@ let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmin", VGPR_32, f32, null_frag
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmax", VGPR_32, f32, null_frag
>;
-
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
-
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
>;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
->;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
->;
-
}
let SubtargetPredicate = HasD16LoadStore in {
@@ -1235,12 +1227,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16
>;
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
"buffer_atomic_add_f32", VGPR_32, f32, null_frag
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
@@ -1249,7 +1241,9 @@ let SubtargetPredicate = isGFX12Plus in {
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
"buffer_atomic_cond_sub_u32", VGPR_32, i32
>;
+}
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
let FPAtomic = 1 in
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
"buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
@@ -1320,6 +1314,9 @@ let SubtargetPredicate = isGFX90APlus in {
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+
+ // Note: on some subtargets these instructions are named
+ // buffer_atomic_fmin_x2/buffer_atomic_fmax_x2 instead.
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
@@ -1421,18 +1418,22 @@ let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX4">;
+}
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
@@ -1495,6 +1496,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -1521,18 +1523,22 @@ let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX4">;
+}
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
@@ -1545,7 +1551,7 @@ multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bi
defvar Op = !cast<SDPatternOperator>(OpPrefix
# !if(!eq(RtnMode, "ret"), "", "_noret")
- # !if(isIntr, "", "_" # vt.Size));
+ # !if(isIntr, "", "_" # vt));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
@@ -1582,7 +1588,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string
defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global"
# !if(!eq(RtnMode, "ret"), "", "_noret")
- # "_" # vt.Size);
+ # "_" # vt);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass;
@@ -1641,6 +1647,16 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" #
} // end foreach Ty
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f32, "BUFFER_ATOMIC_FMIN">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f32, "BUFFER_ATOMIC_FMAX">;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f64, "BUFFER_ATOMIC_MIN_F64">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f64, "BUFFER_ATOMIC_MAX_F64">;
+}
+
defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
@@ -1695,9 +1711,11 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
list<string> RtnModes = ["ret", "noret"]> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, RtnModes>;
}
+
+ // FIXME: This needs a !HasUnrestrictedSOffset predicate
defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>;
}
@@ -1728,24 +1746,29 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
-let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16">;
+}
+
let SubtargetPredicate = isGFX12Plus in {
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
+}
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in {
+defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
}
-let OtherPredicates = [isGFX6GFX7GFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
}
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
- defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">;
- defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">;
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
}
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
@@ -1799,33 +1822,28 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">;
}
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
- let SubtargetPredicate = isGFX9Only in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
-
- let SubtargetPredicate = isGFX12Plus in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>;
-} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
- let SubtargetPredicate = isGFX9Only in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
-
- let SubtargetPredicate = isGFX12Plus in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>;
-} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
-let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
+let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
+} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts
multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
@@ -1897,7 +1915,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
}
multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>;
}
defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">;
@@ -1948,7 +1966,7 @@ multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt,
multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt,
PatFrag ld> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>;
}
defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>;
@@ -2189,7 +2207,7 @@ multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>;
}
defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
@@ -2204,7 +2222,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
-let OtherPredicates = [HasUnpackedD16VMem] in {
+let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
@@ -2212,7 +2230,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in {
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let OtherPredicates = [HasPackedD16VMem] in {
+let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
@@ -2261,7 +2279,7 @@ multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>;
}
defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
@@ -2276,7 +2294,7 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
-let OtherPredicates = [HasUnpackedD16VMem] in {
+let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
@@ -2284,7 +2302,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in {
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let OtherPredicates = [HasPackedD16VMem] in {
+let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
@@ -2296,6 +2314,12 @@ let OtherPredicates = [HasPackedD16VMem] in {
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
+// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
+// specific pseudo (the _OFFSET variant in this case) since any of them will work.
+class get_BUF_ps<string name> {
+ string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic;
+}
+
//===----------------------------------------------------------------------===//
// Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
@@ -2327,8 +2351,8 @@ multiclass MUBUF_Real_gfx11<bits<8> op, string real_name = !cast<MUBUF_Pseudo>(N
}
}
-class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
- Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> {
+class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef, string asmName> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef, asmName> {
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
@@ -2338,9 +2362,10 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
let Inst{55} = ps.tfe;
}
-multiclass MUBUF_Real_gfx10<bits<8> op> {
- defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
+multiclass MUBUF_Real_gfx10<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defvar ps = !cast<MUBUF_Pseudo>(psName);
+ def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10, asmName> {
let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25} = op{7};
let AssemblerPredicate = isGFX10Only;
@@ -2348,9 +2373,10 @@ multiclass MUBUF_Real_gfx10<bits<8> op> {
}
}
-multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> {
- defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+multiclass MUBUF_Real_gfx6_gfx7<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defvar ps = !cast<MUBUF_Pseudo>(psName);
+ def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, asmName> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX6GFX7;
let DecoderNamespace = "GFX6GFX7";
@@ -2359,7 +2385,7 @@ multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> {
multiclass MUBUF_Real_gfx6<bits<8> op> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX6;
let DecoderNamespace = "GFX6";
@@ -2368,7 +2394,7 @@ multiclass MUBUF_Real_gfx6<bits<8> op> {
multiclass MUBUF_Real_gfx7<bits<8> op> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace = "GFX7";
@@ -2445,9 +2471,15 @@ class VBUFFER_Real_gfx12<bits<8> op, BUF_Pseudo ps, string real_name> :
multiclass VBUFFER_MUBUF_Real_gfx12<bits<8> op, string real_name> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
def _gfx12 : VBUFFER_Real_gfx12<op, ps, real_name> {
- // Set the last bit of format to 1 to avoid round-trip issues, as some tools
+ // Set the format field to be 1 to avoid round-trip issues, as some tools
// print BUF_FMT_INVALID for format 0.
- let Inst{55} = 0b1;
+ let Inst{61-55} = 0b0000001;
+ }
+ // Provide a second version of the instruction for the disassembler to
+ // decode to when the format field holds any other value.
+ def _gfx12_format : VBUFFER_Real<op, ps, real_name> {
+ let AsmVariantName = "NonParsable";
+ let DecoderNamespace = "GFX12";
}
}
@@ -2463,12 +2495,6 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
// MUBUF - GFX11, GFX12.
//===----------------------------------------------------------------------===//
-// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
-// specific pseudo (bothen in this case) since any of them will work.
-class get_BUF_ps<string name> {
- string Mnemonic = !cast<BUF_Pseudo>(name # "_BOTHEN").Mnemonic;
-}
-
// gfx11 instruction that accept both old and new assembler name.
class Mnem_gfx11_gfx12 <string mnemonic, string real_name> :
AMDGPUMnemonicAlias<mnemonic, real_name> {
@@ -2690,18 +2716,20 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op, bit isTFE = 0> {
defm _LDS_BOTHEN : MUBUF_Real_gfx10<op>;
}
}
-multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
- defm _BOTHEN_RTN : MUBUF_Real_gfx10<op>;
- defm _IDXEN_RTN : MUBUF_Real_gfx10<op>;
- defm _OFFEN_RTN : MUBUF_Real_gfx10<op>;
- defm _OFFSET_RTN : MUBUF_Real_gfx10<op>;
+multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defm _BOTHEN_RTN : MUBUF_Real_gfx10<op, psName#"_BOTHEN_RTN", asmName>;
+ defm _IDXEN_RTN : MUBUF_Real_gfx10<op, psName#"_IDXEN_RTN", asmName>;
+ defm _OFFEN_RTN : MUBUF_Real_gfx10<op, psName#"_OFFEN_RTN", asmName>;
+ defm _OFFSET_RTN : MUBUF_Real_gfx10<op, psName#"_OFFSET_RTN", asmName>;
}
-multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_Atomics_RTN_gfx10<op> {
- defm _BOTHEN : MUBUF_Real_gfx10<op>;
- defm _IDXEN : MUBUF_Real_gfx10<op>;
- defm _OFFEN : MUBUF_Real_gfx10<op>;
- defm _OFFSET : MUBUF_Real_gfx10<op>;
+multiclass MUBUF_Real_Atomics_gfx10<bits<8> op, string psName = NAME,
+ string asmName = get_BUF_ps<psName>.Mnemonic> :
+ MUBUF_Real_Atomics_RTN_gfx10<op, psName, asmName> {
+ defm _BOTHEN : MUBUF_Real_gfx10<op, psName#"_BOTHEN", asmName>;
+ defm _IDXEN : MUBUF_Real_gfx10<op, psName#"_IDXEN", asmName>;
+ defm _OFFEN : MUBUF_Real_gfx10<op, psName#"_OFFEN", asmName>;
+ defm _OFFSET : MUBUF_Real_gfx10<op, psName#"_OFFSET", asmName>;
}
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2756,18 +2784,18 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op, bit isTFE = 0> {
defm _LDS_BOTHEN : MUBUF_Real_gfx6_gfx7<op>;
}
}
-multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> {
- defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op>;
- defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _IDXEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFSET : MUBUF_Real_gfx6_gfx7<op>;
+multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op, string psName, string asmName> {
+ defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64", asmName>;
+ defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN", asmName>;
+ defm _IDXEN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN", asmName>;
+ defm _OFFEN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN", asmName>;
+ defm _OFFSET : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET", asmName>;
- defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op>;
+ defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64_RTN", asmName>;
+ defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN_RTN", asmName>;
+ defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN_RTN", asmName>;
+ defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN_RTN", asmName>;
+ defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET_RTN", asmName>;
}
multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> :
@@ -2782,8 +2810,10 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> {
defm _TFE : MUBUF_Real_AllAddr_Lds_Helper_gfx6_gfx7_gfx10<op, 1>;
}
-multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> :
- MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>;
+multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op, string psName = NAME,
+ string asmName = get_BUF_ps<psName>.Mnemonic> :
+ MUBUF_Real_Atomics_gfx6_gfx7<op, psName, asmName>,
+ MUBUF_Real_Atomics_gfx10<op, psName, asmName>;
// FIXME-GFX6: Following instructions are available only on GFX6.
//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>;
@@ -2843,8 +2873,8 @@ defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f, "BUFFER_ATOMIC_MIN_F64", "buffer_atomic_fmin_x2">;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060, "BUFFER_ATOMIC_MAX_F64", "buffer_atomic_fmax_x2">;
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>;
@@ -3066,9 +3096,9 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, bit isTFE = 0> : MUBUF_Real_vi<op> {
}
if ps.FPAtomic then {
- let SubtargetPredicate = isGFX90AOnly,
- AssemblerPredicate = isGFX90AOnly in
- defm NAME : MUBUF_Real_gfx90a<op, 0>;
+ let AssemblerPredicate = isGFX90AOnly in
+ defm NAME : MUBUF_Real_gfx90a<op, 0>;
+
def _gfx940 : MUBUF_Real_gfx940<op, ps>;
}
}
@@ -3251,10 +3281,7 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_vi <0x3f>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
-
-let SubtargetPredicate = HasAtomicFaddNoRtnInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
-} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts
let SubtargetPredicate = isGFX90APlus in {
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 19bb4300531c..219246b71fe8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -965,16 +965,16 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
}
let OtherPredicates = [HasGDS] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
}
}
@@ -983,24 +983,24 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [HasGDS] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1019,23 +1019,23 @@ class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt),
/* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size),
+ !cast<PatFrag>(frag#"_local_noret_"#vt),
/* complexity */ 1>;
}
let OtherPredicates = [HasGDS] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1053,14 +1053,14 @@ class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag,
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
let OtherPredicates = [HasGDS] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1082,6 +1082,12 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax
defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">;
defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">;
+
+let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
+}
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
@@ -1119,9 +1125,9 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = HasLdsAtomicAddF64 in {
-def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_f64>;
let AddedComplexity = 1 in
-def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
+def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_f64>;
class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
bit gds=0> : GCNPat <
@@ -1135,18 +1141,7 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret
}
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
-let AddedComplexity = 1 in
-def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
-let AddedComplexity = 1 in
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
let OtherPredicates = [HasGDS] in
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 05063c6c321a..76a559c9443b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -21,6 +21,7 @@
#include "SIDefines.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -52,6 +53,13 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
// ToDo: AMDGPUDisassembler supports only VI ISA.
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
report_fatal_error("Disassembly not yet supported for subtarget");
+
+ for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
+ createConstantSymbolExpr(Symbol, Code);
+
+ UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000);
+ UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000);
+ UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000);
}
void AMDGPUDisassembler::setABIVersion(unsigned Version) {
@@ -421,6 +429,13 @@ DECODE_SDWA(Src32)
DECODE_SDWA(Src16)
DECODE_SDWA(VopcDst)
+/// Decoder hook for version immediates: forwards \p Imm to
+/// AMDGPUDisassembler::decodeVersionImm() and appends the resulting operand
+/// to \p Inst. The address argument is unused.
+static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
+                                     uint64_t /* Addr */,
+                                     const MCDisassembler *Decoder) {
+  const auto &Disasm = *static_cast<const AMDGPUDisassembler *>(Decoder);
+  MCOperand VersionOp = Disasm.decodeVersionImm(Imm);
+  return addOperand(Inst, VersionOp);
+}
+
#include "AMDGPUGenDisassemblerTables.inc"
//===----------------------------------------------------------------------===//
@@ -1727,6 +1742,41 @@ MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
return MCOperand::createImm(Val);
}
+/// Decodes a version immediate into an operand.
+///
+/// The immediate is split into a version code (bits 7:0) and three flag bits
+/// (13, 14 and 15). If re-encoding the decoded fields does not reproduce
+/// \p Imm, i.e. bits outside those fields are set, the raw value is returned
+/// as a plain immediate. Otherwise the result is an expression: the symbol
+/// of the matching known GFX version (or a constant when the code is not in
+/// the table), OR'ed with the symbolic flag expressions for each set flag.
+MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const {
+  using VersionField = AMDGPU::EncodingField<7, 0>;
+  using W64Bit = AMDGPU::EncodingBit<13>;
+  using W32Bit = AMDGPU::EncodingBit<14>;
+  using MDPBit = AMDGPU::EncodingBit<15>;
+  using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>;
+
+  auto [Version, W64, W32, MDP] = Encoding::decode(Imm);
+
+  // A round trip that loses bits means unused bits were raised; keep the
+  // operand as a plain immediate in that case.
+  const bool RoundTrips = Encoding::encode(Version, W64, W32, MDP) == Imm;
+  if (!RoundTrips)
+    return MCOperand::createImm(Imm);
+
+  MCContext &Ctx = getContext();
+
+  // Prefer the symbolic name of the first table entry with this code.
+  const MCExpr *Result = nullptr;
+  for (const AMDGPU::UCVersion::GFXVersion &Known :
+       AMDGPU::UCVersion::getGFXVersions()) {
+    if (Known.Code == Version) {
+      Result =
+          MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(Known.Symbol), Ctx);
+      break;
+    }
+  }
+  // Unknown version codes are emitted as a numeric constant.
+  if (!Result)
+    Result = MCConstantExpr::create(Version, Ctx);
+
+  // Append the flag symbols in W64, W32, MDP order.
+  if (W64)
+    Result = MCBinaryExpr::createOr(Result, UCVersionW64Expr, Ctx);
+  if (W32)
+    Result = MCBinaryExpr::createOr(Result, UCVersionW32Expr, Ctx);
+  if (MDP)
+    Result = MCBinaryExpr::createOr(Result, UCVersionMDPExpr, Ctx);
+
+  return MCOperand::createExpr(Result);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
@@ -2312,6 +2362,15 @@ Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
return false;
}
+/// Returns a symbol reference expression for the symbol named \p Id, binding
+/// the symbol to the constant \p Val if it does not have a value yet.
+///
+/// The symbol is looked up in the MCContext, so it may already carry a value
+/// (for example when another disassembler was set up on the same context).
+/// In that case the existing definition is kept, and a warning is reported
+/// if it does not evaluate to \p Val.
+const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
+                                                           int64_t Val) {
+  MCContext &Ctx = getContext();
+  MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
+  if (!Sym->isVariable()) {
+    Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
+  } else {
+    // Never rebind an already-defined symbol; only check that it agrees.
+    // Seed with a value different from Val so a failed evaluation cannot
+    // accidentally compare equal.
+    int64_t Existing = ~Val;
+    bool Valid = Sym->getVariableValue()->evaluateAsAbsolute(Existing);
+    if (!Valid || Existing != Val)
+      Ctx.reportWarning(SMLoc(), "unsupported redefinition of " + Id);
+  }
+  return MCSymbolRefExpr::create(Sym, Ctx);
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2061d83af3da..694cd7a9bfd2 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -102,6 +102,11 @@ private:
mutable bool HasLiteral;
mutable std::optional<bool> EnableWavefrontSize32;
unsigned CodeObjectVersion;
+ const MCExpr *UCVersionW64Expr;
+ const MCExpr *UCVersionW32Expr;
+ const MCExpr *UCVersionMDPExpr;
+
+ const MCExpr *createConstantSymbolExpr(StringRef Id, int64_t Val);
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -264,6 +269,8 @@ public:
MCOperand decodeSplitBarrier(unsigned Val) const;
MCOperand decodeDpp8FI(unsigned Val) const;
+ MCOperand decodeVersionImm(unsigned Imm) const;
+
int getTTmpIdx(unsigned Val) const;
const MCInstrInfo *getMCII() const { return MCII.get(); }
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 3767dd0b6d47..280def5440c8 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -322,25 +322,25 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
$ptr), sub1)>;
defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_NORET,
- atomic_swap_global_noret_32>;
+ atomic_swap_global_noret_i32>;
defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_NORET,
- atomic_load_add_global_noret_32>;
+ atomic_load_add_global_noret_i32>;
defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_NORET,
- atomic_load_sub_global_noret_32>;
+ atomic_load_sub_global_noret_i32>;
defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_NORET,
- atomic_load_min_global_noret_32>;
+ atomic_load_min_global_noret_i32>;
defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_NORET,
- atomic_load_umin_global_noret_32>;
+ atomic_load_umin_global_noret_i32>;
defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_NORET,
- atomic_load_max_global_noret_32>;
+ atomic_load_max_global_noret_i32>;
defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_NORET,
- atomic_load_umax_global_noret_32>;
+ atomic_load_umax_global_noret_i32>;
defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_NORET,
- atomic_load_and_global_noret_32>;
+ atomic_load_and_global_noret_i32>;
defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_NORET,
- atomic_load_or_global_noret_32>;
+ atomic_load_or_global_noret_i32>;
defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_NORET,
- atomic_load_xor_global_noret_32>;
+ atomic_load_xor_global_noret_i32>;
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
@@ -712,37 +712,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
[(truncstorei16_local i32:$src1, i32:$src0)]
>;
def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
- [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_add_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
- [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_sub_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
- [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_and_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
- [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_or_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
- [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_xor_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
- [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_min_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
- [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_max_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
- [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umin_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
- [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umax_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
- [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_swap_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
- [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))]
+ [(set i32:$dst, (atomic_cmp_swap_local_i32 i32:$src0, i32:$src1, i32:$src2))]
>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index aab19b8adc27..98054dde398b 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -752,25 +752,29 @@ defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
// GFX7-, GFX10-only flat instructions.
let SubtargetPredicate = isGFX7GFX10 in {
-
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
VReg_64, f64, v2f64, VReg_128>;
+} // End SubtargetPredicate = isGFX7GFX10
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
- VReg_64, f64>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
- VReg_64, f64>;
+// The names may be flat_atomic_fmin_x2 on some subtargets, but we
+// choose this as the canonical name.
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64",
+ VReg_64, f64>;
-} // End SubtargetPredicate = isGFX7GFX10
+defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64",
+ VReg_64, f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
+}
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
@@ -972,6 +976,15 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
let SubtargetPredicate = isGFX12Plus in {
+ let Uses = [EXEC, M0] in {
+ defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
+ defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>;
+ }
+ let Uses = [EXEC, FLAT_SCR, M0] in {
+ defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>;
+ defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>;
+ }
+
let WaveSizePredicate = isWave32 in {
let Mnemonic = "global_load_tr_b128" in
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
@@ -995,10 +1008,6 @@ let SubtargetPredicate = isGFX10Plus in {
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>;
- defm GLOBAL_ATOMIC_FMIN_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
- defm GLOBAL_ATOMIC_FMAX_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
@@ -1105,7 +1114,7 @@ multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addr
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+ FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
@@ -1123,7 +1132,7 @@ multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSp
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+ FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
multiclass FlatAtomicPat <string inst, string node, ValueType vt,
@@ -1155,8 +1164,8 @@ class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, int complexity = 0,
bit isIntr = 0> {
- defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
- defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt));
let AddedComplexity = complexity in
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
@@ -1165,21 +1174,6 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
}
-multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt> {
- defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>;
-}
-
-multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt, ValueType data_vt = vt> {
- defvar noRtnNode = !cast<PatFrags>(intr # "_noret_" # addrSpaceSuffix);
- defvar rtnNode = !cast<PatFrags>(intr # "_" # addrSpaceSuffix);
-
- let AddedComplexity = 1 in
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
-}
-
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
@@ -1280,11 +1274,11 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt.Size), vt, data_vt>;
+ GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt), vt, data_vt>;
multiclass GlobalFLATAtomicPatsRtn<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt.Size), vt, data_vt>;
+ GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt), vt, data_vt>;
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
@@ -1431,6 +1425,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_"#as, f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_"#as, f32>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
+}
+
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1592,37 +1597,26 @@ let OtherPredicates = [isGFX12Plus] in {
}
}
-let OtherPredicates = [isGFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-
-let OtherPredicates = [isGFX10GFX11] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
-
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
-let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
let OtherPredicates = [isGFX12Only] in {
+ // FIXME: Remove these intrinsics
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
- defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
- defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
}
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
@@ -1645,37 +1639,44 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
-let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
+}
+
+let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
}
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
}
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
-let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
+// A brace-less `let ... in` only covers the next statement, so brace the
+// scope to keep both GLOBAL_ATOMIC_PK_ADD_BF16 patterns under the predicate.
+let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
+} // End OtherPredicates = [HasAtomicGlobalPkAddBF16Inst]
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1745,8 +1746,8 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
// CI
//===----------------------------------------------------------------------===//
-class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
- FLAT_Real <op, ps>,
+class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps, string asmName = ps.Mnemonic> :
+ FLAT_Real <op, ps, asmName>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace="GFX7";
@@ -1768,10 +1769,13 @@ def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
-multiclass FLAT_Real_Atomics_ci <bits<7> op> {
- defvar ps = !cast<FLAT_Pseudo>(NAME);
- def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
- def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+multiclass FLAT_Real_Atomics_ci <bits<7> op, string opName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(opName).Mnemonic> {
+ defvar ps = !cast<FLAT_Pseudo>(opName);
+ defvar ps_rtn = !cast<FLAT_Pseudo>(opName#"_RTN");
+
+ def _ci : FLAT_Real_ci<op, ps, asmName>;
+ def _RTN_ci : FLAT_Real_ci<op, ps_rtn, asmName>;
}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30>;
@@ -1806,8 +1810,8 @@ defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e>;
defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f>;
defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">;
//===----------------------------------------------------------------------===//
@@ -2089,8 +2093,8 @@ let SubtargetPredicate = isGFX940Plus in {
// GFX10.
//===----------------------------------------------------------------------===//
-class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
- FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
+class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+ FLAT_Real<op, ps, opName>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
let AssemblerPredicate = isGFX10Only;
let DecoderNamespace = "GFX10";
@@ -2102,25 +2106,28 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
let Inst{55} = 0;
}
-
-multiclass FLAT_Real_Base_gfx10<bits<7> op> {
+multiclass FLAT_Real_Base_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> {
def _gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName), asmName>;
}
-multiclass FLAT_Real_RTN_gfx10<bits<7> op> {
+multiclass FLAT_Real_RTN_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> {
def _RTN_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_RTN"), asmName>;
}
-multiclass FLAT_Real_SADDR_gfx10<bits<7> op> {
+multiclass FLAT_Real_SADDR_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR").Mnemonic> {
def _SADDR_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR"), asmName>;
}
-multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
+multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR_RTN").Mnemonic> {
def _SADDR_RTN_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR_RTN"), asmName>;
}
multiclass FLAT_Real_ST_gfx10<bits<7> op> {
@@ -2128,22 +2135,25 @@ multiclass FLAT_Real_ST_gfx10<bits<7> op> {
FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
}
-multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
- FLAT_Real_Base_gfx10<op>,
- FLAT_Real_SADDR_gfx10<op>;
+multiclass FLAT_Real_AllAddr_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_Base_gfx10<op, OpName, asmName>,
+ FLAT_Real_SADDR_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_Atomics_gfx10<bits<7> op> :
- FLAT_Real_Base_gfx10<op>,
- FLAT_Real_RTN_gfx10<op>;
+multiclass FLAT_Real_Atomics_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_Base_gfx10<op, OpName, asmName>,
+ FLAT_Real_RTN_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
- FLAT_Real_AllAddr_gfx10<op>,
- FLAT_Real_RTN_gfx10<op>,
- FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_AllAddr_gfx10<op, OpName, asmName>,
+ FLAT_Real_RTN_gfx10<op, OpName, asmName>,
+ FLAT_Real_SADDR_RTN_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
- FLAT_Real_RTN_gfx10<op>,
- FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op, string OpName = NAME> :
+ FLAT_Real_RTN_gfx10<op, OpName>,
+ FLAT_Real_SADDR_RTN_gfx10<op, OpName>;
multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> :
FLAT_Real_Base_gfx10<op>,
@@ -2220,8 +2230,8 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">;
// ENC_FLAT_GLBL.
@@ -2278,8 +2288,8 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
-defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
-defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
+defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f, "GLOBAL_ATOMIC_MIN_F64", "global_atomic_fmin_x2">;
+defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060, "GLOBAL_ATOMIC_MAX_F64", "global_atomic_fmax_x2">;
defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>;
defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>;
@@ -2671,6 +2681,8 @@ defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_s
defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>;
+defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>;
defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
@@ -2741,3 +2753,6 @@ defm SCRATCH_LOAD_SBYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x22, "scratch_
defm SCRATCH_LOAD_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x23, "scratch_load_d16_hi_b16">;
defm SCRATCH_STORE_BYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x24, "scratch_store_d16_hi_b8">;
defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_store_d16_hi_b16">;
+
+defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
+defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94d93390d091..217279211531 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -116,31 +116,112 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
+/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
+/// current register pressure.
+///
+/// This works for the common case, but it has a few exceptions that have been
+/// observed through trial and error:
+/// - Explicit physical register operands
+/// - Subregister definitions
+///
+/// In both of those cases, PressureDiff doesn't represent the actual pressure,
+/// and querying LiveIntervals through the RegPressureTracker is needed to get
+/// an accurate value.
+///
+/// We should eventually only use PressureDiff for maximum performance, but this
+/// already allows 80% of SUs to take the fast path without changing scheduling
+/// at all. Further changes would either change scheduling, or require a lot
+/// more logic to recover an accurate pressure estimate from the PressureDiffs.
+static bool canUsePressureDiffs(const SUnit &SU) {
+ if (!SU.isInstr())
+ return false;
+
+ // Cannot use pressure diffs for subregister defs or with physregs; they are
+ // imprecise in both cases. Implicit operands are skipped by this check.
+ for (const auto &Op : SU.getInstr()->operands()) {
+ if (!Op.isReg() || Op.isImplicit())
+ continue;
+ if (Op.getReg().isPhysical() ||
+ (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
+ return false;
+ }
+ return true;
+}
+
+static void getRegisterPressures(bool AtTop,
+ const RegPressureTracker &RPTracker, SUnit *SU,
+ std::vector<unsigned> &Pressure,
+ std::vector<unsigned> &MaxPressure) {
+ // getDownwardPressure() and getUpwardPressure() make temporary changes to
+ // the tracker, so we need to call them through a non-const reference.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+ if (AtTop)
+ TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ else
+ TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+}
+
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
- unsigned VGPRPressure) {
+ unsigned VGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
if (!DAG->isTrackingPressure())
return;
- // getDownwardPressure() and getUpwardPressure() make temporary changes to
- // the tracker, so we need to pass those function a non-const copy.
- RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
-
Pressure.clear();
MaxPressure.clear();
- if (AtTop)
- TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else {
- // FIXME: I think for bottom up scheduling, the register pressure is cached
- // and can be retrieved by DAG->getPressureDif(SU).
- TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ // We try to use the cached PressureDiffs in the ScheduleDAG whenever
+ // possible over querying the RegPressureTracker.
+ //
+ // RegPressureTracker will make a lot of LIS queries, which are very
+ // expensive, so querying it is considered slow in this context.
+ //
+ // PressureDiffs are precomputed and cached, and getPressureDiff is just a
+ // trivial lookup into an array. It is pretty much free.
+ //
+ // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
+ // PressureDiffs.
+ if (AtTop || !canUsePressureDiffs(*SU)) {
+ getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+ } else {
+ // Resize to 4 zero-initialized slots.
+ Pressure.resize(4, 0);
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
+
+ for (const auto &Diff : DAG->getPressureDiff(SU)) {
+ if (!Diff.isValid())
+ continue;
+ // PressureDiffs is always bottom-up so if we're working top-down we need
+ // to invert its sign.
+ Pressure[Diff.getPSet()] +=
+ (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
+ }
+
+#ifdef EXPENSIVE_CHECKS
+ std::vector<unsigned> CheckPressure, CheckMaxPressure;
+ getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+ if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
+ errs() << "Register Pressure is inaccurate when calculated through "
+ "PressureDiff\n"
+ << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
+ << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
+ report_fatal_error("inaccurate register pressure calculation");
+ }
+#endif
}
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -158,7 +239,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
-
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
@@ -207,7 +287,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand) {
+ SchedCandidate &Cand,
+ bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
@@ -220,8 +301,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
- initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
- SGPRPressure, VGPRPressure);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
@@ -262,7 +343,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
- pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ /*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
@@ -270,7 +352,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
- pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ /*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
}
@@ -282,7 +365,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
- pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
@@ -290,7 +374,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
- pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ /*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
}
@@ -327,7 +412,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
- pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
@@ -337,7 +423,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
- pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ /*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2084aae4128f..f0aea2bc4ab8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -45,12 +45,12 @@ protected:
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand);
+ SchedCandidate &Cand, bool IsBottomUp);
- void initCandidate(SchedCandidate &Cand, SUnit *SU,
- bool AtTop, const RegPressureTracker &RPTracker,
- const SIRegisterInfo *SRI,
- unsigned SGPRPressure, unsigned VGPRPressure);
+ void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
+ const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI, unsigned SGPRPressure,
+ unsigned VGPRPressure, bool IsBottomUp);
std::vector<unsigned> Pressure;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f2238..07ff855756ec 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -159,6 +159,10 @@ protected:
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
bool HasPkFmacF16Inst = false;
+ bool HasAtomicFMinFMaxF32GlobalInsts = false;
+ bool HasAtomicFMinFMaxF64GlobalInsts = false;
+ bool HasAtomicFMinFMaxF32FlatInsts = false;
+ bool HasAtomicFMinFMaxF64FlatInsts = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
bool HasAtomicFaddRtnInsts = false;
@@ -167,6 +171,7 @@ protected:
bool HasAtomicBufferGlobalPkAddF16Insts = false;
bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
+ bool HasAtomicBufferPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool HasDefaultComponentZero = false;
bool HasDefaultComponentBroadcast = false;
@@ -820,6 +825,22 @@ public:
return HasPkFmacF16Inst;
}
+ bool hasAtomicFMinFMaxF32GlobalInsts() const {
+ return HasAtomicFMinFMaxF32GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64GlobalInsts() const {
+ return HasAtomicFMinFMaxF64GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF32FlatInsts() const {
+ return HasAtomicFMinFMaxF32FlatInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64FlatInsts() const {
+ return HasAtomicFMinFMaxF64FlatInsts;
+ }
+
bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
@@ -844,6 +865,10 @@ public:
return HasAtomicGlobalPkAddBF16Inst;
}
+ bool hasAtomicBufferPkAddBF16Inst() const {
+ return HasAtomicBufferPkAddBF16Inst;
+ }
+
bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
@@ -1547,6 +1572,8 @@ public:
bool hasFlatScratchInit() const { return FlatScratchInit; }
+ bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
+
unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
@@ -1611,6 +1638,8 @@ private:
bool FlatScratchInit = false;
+ bool PrivateSegmentSize = false;
+
unsigned NumKernargPreloadSGPRs = 0;
unsigned NumUsedUserSGPRs = 0;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 883b6c4407fe..bb5de368810d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -43,7 +43,6 @@ void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const {
void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
- OS.flush();
printInstruction(MI, Address, STI, OS);
printAnnotation(OS, Annot);
}
@@ -57,9 +56,15 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O, &MAI);
+ return;
+ }
+
// It's possible to end up with a 32-bit literal used with a 16-bit operand
// with ignored high bits. Print as 32-bit anyway in that case.
- int64_t Imm = MI->getOperand(OpNo).getImm();
+ int64_t Imm = Op.getImm();
if (isInt<16>(Imm) || isUInt<16>(Imm))
O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
else
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index fb93f45e3e87..b3cca91f6380 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -662,6 +662,11 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ int64_t Val;
+ if (MO.isExpr() && MO.getExpr()->evaluateAsAbsolute(Val)) {
+ Op = Val;
+ return;
+ }
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
// FIXME: If this is expression is PCRel or not should not depend on what
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 159664faf983..83fbf4ac53d5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -21,13 +21,11 @@
using namespace llvm;
using namespace llvm::AMDGPU;
-AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
- ArrayRef<const MCExpr *> Args,
- MCContext &Ctx)
+AMDGPUMCExpr::AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx)
: Kind(Kind), Ctx(Ctx) {
assert(Args.size() >= 1 && "Needs a minimum of one expression.");
- assert(Kind != AGVK_None &&
- "Cannot construct AMDGPUVariadicMCExpr of kind none.");
+ assert(Kind != AGVK_None && "Cannot construct AMDGPUMCExpr of kind none.");
// Allocating the variadic arguments through the same allocation mechanism
// that the object itself is allocated with so they end up in the same memory.
@@ -40,25 +38,23 @@ AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size());
}
-AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); }
+AMDGPUMCExpr::~AMDGPUMCExpr() { Ctx.deallocate(RawArgs); }
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx);
+const AMDGPUMCExpr *AMDGPUMCExpr::create(VariantKind Kind,
+ ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return new (Ctx) AMDGPUMCExpr(Kind, Args, Ctx);
}
-const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const {
- assert(Index < Args.size() &&
- "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr");
+const MCExpr *AMDGPUMCExpr::getSubExpr(size_t Index) const {
+ assert(Index < Args.size() && "Indexing out of bounds AMDGPUMCExpr sub-expr");
return Args[Index];
}
-void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
- const MCAsmInfo *MAI) const {
+void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
switch (Kind) {
default:
- llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+ llvm_unreachable("Unknown AMDGPUMCExpr kind.");
case AGVK_Or:
OS << "or(";
break;
@@ -86,21 +82,19 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
OS << ')';
}
-static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
- int64_t Arg2) {
+static int64_t op(AMDGPUMCExpr::VariantKind Kind, int64_t Arg1, int64_t Arg2) {
switch (Kind) {
default:
- llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
- case AMDGPUVariadicMCExpr::AGVK_Max:
+ llvm_unreachable("Unknown AMDGPUMCExpr kind.");
+ case AMDGPUMCExpr::AGVK_Max:
return std::max(Arg1, Arg2);
- case AMDGPUVariadicMCExpr::AGVK_Or:
+ case AMDGPUMCExpr::AGVK_Or:
return Arg1 | Arg2;
}
}
-bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -112,7 +106,7 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
};
assert(Args.size() == 3 &&
- "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+ "AMDGPUMCExpr Argument count incorrect for ExtraSGPRs");
const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0;
@@ -129,9 +123,8 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -142,7 +135,7 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
return true;
};
assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+ "AMDGPUMCExpr Argument count incorrect for TotalNumVGPRs");
const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
uint64_t NumAGPR = 0, NumVGPR = 0;
@@ -158,9 +151,8 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -172,7 +164,7 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
};
assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for AlignTo");
+ "AMDGPUMCExpr Argument count incorrect for AlignTo");
uint64_t Value = 0, Align = 0;
if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align))
return false;
@@ -181,9 +173,8 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -194,7 +185,7 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
return true;
};
assert(Args.size() == 7 &&
- "AMDGPUVariadic Argument count incorrect for Occupancy");
+ "AMDGPUMCExpr Argument count incorrect for Occupancy");
uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
NumSGPRs, NumVGPRs;
@@ -226,8 +217,9 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
- MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
std::optional<int64_t> Total;
switch (Kind) {
@@ -258,12 +250,12 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
return true;
}
-void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+void AMDGPUMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
for (const MCExpr *Arg : Args)
Streamer.visitUsedExpr(*Arg);
}
-MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
+MCFragment *AMDGPUMCExpr::findAssociatedFragment() const {
for (const MCExpr *Arg : Args) {
if (Arg->findAssociatedFragment())
return Arg->findAssociatedFragment();
@@ -275,18 +267,19 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
/// are unresolvable but needed for further MCExprs). Derived from
/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
///
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
- const MCExpr *FlatScrUsed,
- bool XNACKUsed, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed,
+ MCContext &Ctx) {
return create(AGVK_ExtraSGPRs,
{VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)},
Ctx);
}
-const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
- const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx) {
return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
@@ -295,10 +288,11 @@ const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
/// Remove dependency on GCNSubtarget and depend only only the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on.
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs,
- const GCNSubtarget &STM, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index f92350b59235..207a619d45a1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -17,7 +17,7 @@ namespace llvm {
class Function;
class GCNSubtarget;
-/// AMDGPU target specific variadic MCExpr operations.
+/// AMDGPU target specific MCExpr operations.
///
/// Takes in a minimum of 1 argument to be used with an operation. The supported
/// operations are:
@@ -27,9 +27,9 @@ class GCNSubtarget;
/// \note If the 'or'/'max' operations are provided only a single argument, the
/// operation will act as a no-op and simply resolve as the provided argument.
///
-class AMDGPUVariadicMCExpr : public MCTargetExpr {
+class AMDGPUMCExpr : public MCTargetExpr {
public:
- enum VariadicKind {
+ enum VariantKind {
AGVK_None,
AGVK_Or,
AGVK_Max,
@@ -40,14 +40,13 @@ public:
};
private:
- VariadicKind Kind;
+ VariantKind Kind;
MCContext &Ctx;
const MCExpr **RawArgs;
ArrayRef<const MCExpr *> Args;
- AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
- MCContext &Ctx);
- ~AMDGPUVariadicMCExpr();
+ AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+ ~AMDGPUMCExpr();
bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const;
@@ -59,40 +58,39 @@ private:
const MCFixup *Fixup) const;
public:
- static const AMDGPUVariadicMCExpr *
- create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+ static const AMDGPUMCExpr *
+ create(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return create(VariadicKind::AGVK_Or, Args, Ctx);
+ static const AMDGPUMCExpr *createOr(ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return create(VariantKind::AGVK_Or, Args, Ctx);
}
- static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return create(VariadicKind::AGVK_Max, Args, Ctx);
+ static const AMDGPUMCExpr *createMax(ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return create(VariantKind::AGVK_Max, Args, Ctx);
}
- static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
- const MCExpr *FlatScrUsed,
- bool XNACKUsed,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed, MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
- const MCExpr *NumVGPR,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *
+ static const AMDGPUMCExpr *
createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
- return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+ return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
- static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
- const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs,
- const GCNSubtarget &STM,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx);
- VariadicKind getKind() const { return Kind; }
+ VariantKind getKind() const { return Kind; }
const MCExpr *getSubExpr(size_t Index) const;
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e805e964ffe4..531031b58034 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -319,8 +319,9 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
- const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) {
+ const MCKernelDescriptor &KD, const MCExpr *NextVGPR,
+ const MCExpr *NextSGPR, const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
const MCAsmInfo *MAI = getContext().getAsmInfo();
@@ -339,16 +340,25 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << '\n';
};
+ auto EmitMCExpr = [&](const MCExpr *Value) {
+ int64_t evaluatableValue;
+ if (Value->evaluateAsAbsolute(evaluatableValue)) {
+ OS << static_cast<uint64_t>(evaluatableValue);
+ } else {
+ Value->print(OS, MAI);
+ }
+ };
+
OS << "\t\t.amdhsa_group_segment_fixed_size ";
- KD.group_segment_fixed_size->print(OS, MAI);
+ EmitMCExpr(KD.group_segment_fixed_size);
OS << '\n';
OS << "\t\t.amdhsa_private_segment_fixed_size ";
- KD.private_segment_fixed_size->print(OS, MAI);
+ EmitMCExpr(KD.private_segment_fixed_size);
OS << '\n';
OS << "\t\t.amdhsa_kernarg_size ";
- KD.kernarg_size->print(OS, MAI);
+ EmitMCExpr(KD.kernarg_size);
OS << '\n';
PrintField(
@@ -433,8 +443,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
".amdhsa_system_vgpr_workitem_id");
// These directives are required.
- OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
- OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+ OS << "\t\t.amdhsa_next_free_vgpr ";
+ EmitMCExpr(NextVGPR);
+ OS << '\n';
+
+ OS << "\t\t.amdhsa_next_free_sgpr ";
+ EmitMCExpr(NextSGPR);
+ OS << '\n';
if (AMDGPU::isGFX90A(STI)) {
// MCExpr equivalent of taking the (accum_offset + 1) * 4.
@@ -447,19 +462,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
accum_bits = MCBinaryExpr::createMul(
accum_bits, MCConstantExpr::create(4, getContext()), getContext());
OS << "\t\t.amdhsa_accum_offset ";
- int64_t IVal;
- if (accum_bits->evaluateAsAbsolute(IVal)) {
- OS << static_cast<uint64_t>(IVal);
- } else {
- accum_bits->print(OS, MAI);
- }
+ EmitMCExpr(accum_bits);
OS << '\n';
}
- if (!ReserveVCC)
- OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
- if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
- OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
+ OS << "\t\t.amdhsa_reserve_vcc ";
+ EmitMCExpr(ReserveVCC);
+ OS << '\n';
+
+ if (IVersion.Major >= 7 && !hasArchitectedFlatScratch(STI)) {
+ OS << "\t\t.amdhsa_reserve_flat_scratch ";
+ EmitMCExpr(ReserveFlatScr);
+ OS << '\n';
+ }
switch (CodeObjectVersion) {
default:
@@ -915,8 +930,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
- const MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
+ const MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR,
+ const MCExpr *NextSGPR, const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e5c90060cb5d..bf1538c71d15 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -94,8 +94,9 @@ public:
virtual void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) {}
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {}
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
@@ -151,8 +152,9 @@ public:
void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) override;
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -207,8 +209,9 @@ public:
void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) override;
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) override;
};
}
#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
index 22d0594e2b86..56a23e26b8d9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -21,7 +21,6 @@ using namespace llvm;
void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &O) {
- O.flush();
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
}
diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
index 0a96c643d9bd..1a73fdf028c9 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
@@ -113,8 +113,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -140,9 +140,9 @@ public:
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
- MDT = &getAnalysis<MachineDominatorTree>();
- LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
- PDT = &getAnalysis<MachinePostDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ LLVM_DEBUG(MDT->print(dbgs()););
+ PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
LLVM_DEBUG(PDT->print(dbgs()););
prepare();
run();
@@ -1629,8 +1629,8 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 77935cb4cde1..8bac570d59d4 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -103,8 +103,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 59e274787590..64185db02ec1 100644
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -35,8 +35,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index a00ca625fc73..68c5f23c8e11 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -162,8 +162,8 @@ public:
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -173,7 +173,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
-
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5c411a095587..7bf6a635158e 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1519,6 +1519,9 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
+ if (MI.mayRaiseFPException())
+ return nullptr;
+
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1565,6 +1568,9 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
return false;
+ if (Def->mayRaiseFPException())
+ return false;
+
MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
if (!DefClamp)
return false;
@@ -1650,7 +1656,9 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
- MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
+ MFI->getMode().FP64FP16Denormals.Output !=
+ DenormalMode::PreserveSign) ||
+ MI.mayRaiseFPException())
return std::pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1725,6 +1733,9 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
return false;
+ if (Def->mayRaiseFPException())
+ return false;
+
// Clamp is applied after omod. If the source already has clamp set, don't
// fold it.
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d8667affdb4..83bfb622ee52 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
- ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
- ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
ISD::SSUBSAT},
VT, Custom);
@@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
+ MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
+ MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
- {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
- MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
- MVT::i16, MVT::i8, MVT::i128},
+ {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
+ MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
+ MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
+ MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::INTRINSIC_VOID,
- {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
- MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
- MVT::i8, MVT::i128},
+ {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
+ MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
+ MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
+ MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
@@ -942,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_UMIN,
ISD::ATOMIC_LOAD_UMAX,
ISD::ATOMIC_LOAD_FADD,
+ ISD::ATOMIC_LOAD_FMIN,
+ ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::INTRINSIC_VOID,
@@ -1109,29 +1114,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
assert(MaxNumLanes != 0);
+ LLVMContext &Ctx = Ty->getContext();
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
- return EVT::getVectorVT(Ty->getContext(),
- EVT::getEVT(VT->getElementType()),
+ return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
NumElts);
}
- return EVT::getEVT(Ty);
+ return TLI.getValueType(DL, Ty);
}
// Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
auto *ST = dyn_cast<StructType>(Ty);
if (!ST)
- return memVTFromLoadIntrData(Ty, MaxNumLanes);
+ return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
// TFE intrinsics return an aggregate type.
assert(ST->getNumContainedTypes() == 2 &&
ST->getContainedType(1)->isIntegerTy(32));
- return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+ return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}
/// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1200,9 +1209,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
- unsigned MaxNumLanes = 4;
-
if (RsrcIntr->IsImage) {
+ unsigned MaxNumLanes = 4;
+
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(IntrID);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
@@ -1215,9 +1224,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
= cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
}
- }
- Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+ Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+ CI.getType(), MaxNumLanes);
+ } else {
+ Info.memVT =
+ memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+ std::numeric_limits<unsigned>::max());
+ }
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1229,9 +1243,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (RsrcIntr->IsImage) {
unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
- Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+ Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+ DMaskLanes);
} else
- Info.memVT = EVT::getEVT(DataTy);
+ Info.memVT = getValueType(MF.getDataLayout(), DataTy);
Info.flags |= MachineMemOperand::MOStore;
} else {
@@ -1265,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1280,19 +1294,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- if (!Vol || !Vol->isZero())
- Info.flags |= MachineMemOperand::MOVolatile;
-
- return true;
- }
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1449,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
@@ -1610,6 +1610,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
+ if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ AM.BaseOffs < 0) {
+ // Scalar (non-buffer) loads can only use a negative offset if
+ // soffset+offset is non-negative. Since the compiler can only prove that
+ // in a few special cases, it is safer to claim that negative offsets are
+ // not supported.
+ return false;
+ }
+
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
@@ -2468,6 +2478,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ if (UserSGPRInfo.hasPrivateSegmentSize()) {
+ Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
+ MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentSizeReg);
+ }
+
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
}
@@ -5811,6 +5827,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
+ case ISD::ABS:
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
@@ -6097,6 +6114,184 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IID = N->getConstantOperandVal(0);
+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16;
+ SDLoc SL(N);
+ MVT IntVT = MVT::getIntegerVT(ValSize);
+
+ auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
+ SDValue Src2, MVT ValT) -> SDValue {
+ SmallVector<SDValue, 8> Operands;
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ Operands.push_back(N->getOperand(6));
+ Operands.push_back(N->getOperand(5));
+ Operands.push_back(N->getOperand(4));
+ [[fallthrough]];
+ case Intrinsic::amdgcn_writelane:
+ Operands.push_back(Src2);
+ [[fallthrough]];
+ case Intrinsic::amdgcn_readlane:
+ Operands.push_back(Src1);
+ [[fallthrough]];
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64:
+ Operands.push_back(Src0);
+ break;
+ default:
+ llvm_unreachable("unhandled lane op");
+ }
+
+ Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+ std::reverse(Operands.begin(), Operands.end());
+
+ if (SDNode *GL = N->getGluedNode()) {
+ assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+ GL = GL->getOperand(0).getNode();
+ Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL, 0)));
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
+ };
+
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+ IsPermLane16) {
+ Src1 = N->getOperand(2);
+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
+ Src2 = N->getOperand(3);
+ }
+
+ if (ValSize == 32) {
+ // Already legal
+ return SDValue();
+ }
+
+ if (ValSize < 32) {
+ bool IsFloat = VT.isFloatingPoint();
+ Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+ SL, MVT::i32);
+
+ if (IsPermLane16) {
+ Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
+ SL, MVT::i32);
+ }
+
+ if (IID == Intrinsic::amdgcn_writelane) {
+ Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+ SL, MVT::i32);
+ }
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+ return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+ }
+
+ if (ValSize % 32 != 0)
+ return SDValue();
+
+ auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue, 8> Scalars;
+ unsigned NumOperands = N->getNumOperands();
+ SmallVector<SDValue, 4> Operands(NumOperands);
+ SDNode *GL = N->getGluedNode();
+
+ // only handle convergencectrl_glue
+ assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+
+ for (unsigned i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
+ ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, SL));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ if (GL)
+ Operands[NumOperands - 1] =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL->getOperand(0).getNode(), 0));
+
+ Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
+ return DAG.getBuildVector(VecVT, SL, Scalars);
+ };
+
+ if (VT.isVector()) {
+ switch (MVT::SimpleValueType EltTy =
+ VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32: {
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+ return unrollLaneOp(LaneOp.getNode());
+ }
+ case MVT::i16:
+ case MVT::f16:
+ case MVT::bf16: {
+ MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+ SmallVector<SDValue, 4> Pieces;
+ SDValue Src0SubVec, Src1SubVec, Src2SubVec;
+ for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+ Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IsPermLane16)
+ Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ Pieces.push_back(
+ IsPermLane16
+ ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
+ : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+ EltIdx += 2;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+ }
+ default:
+ // Handle all other cases by bitcasting to i32 vectors
+ break;
+ }
+ }
+
+ MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+ Src0 = DAG.getBitcast(VecVT, Src0);
+
+ if (IsPermLane16)
+ Src1 = DAG.getBitcast(VecVT, Src1);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = DAG.getBitcast(VecVT, Src2);
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+ SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
+ return DAG.getBitcast(VT, UnrolledLaneOp);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -8563,6 +8758,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -8609,12 +8811,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
M->getMemOperand());
}
-// Return a value to use for the idxen operand by examining the vindex operand.
-static unsigned getIdxEn(SDValue VIndex) {
- // No need to set idxen if vindex is known to be zero.
- return isNullConstant(VIndex) ? 0 : 1;
-}
-
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -8703,78 +8899,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
- case Intrinsic::amdgcn_ds_fadd: {
- MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
- Opc = ISD::ATOMIC_LOAD_FADD;
- break;
- }
-
- return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
- M->getOperand(0), M->getOperand(2), M->getOperand(3),
- M->getMemOperand());
- }
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fmin:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
- break;
- case Intrinsic::amdgcn_ds_fmax:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
- break;
- default:
- llvm_unreachable("Unknown intrinsic!");
- }
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format: {
- unsigned Glc = Op.getConstantOperandVal(5);
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
-
- unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
- M->getMemOperand());
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
+ : ISD::ATOMIC_LOAD_FMAX;
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
+ M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -8825,35 +8957,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
- case Intrinsic::amdgcn_tbuffer_load: {
- MemSDNode *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
- unsigned Dfmt = Op.getConstantOperandVal(7);
- unsigned Nfmt = Op.getConstantOperandVal(8);
- unsigned Glc = Op.getConstantOperandVal(9);
- unsigned Slc = Op.getConstantOperandVal(10);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- SOffset, // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
- };
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
- return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
- DAG);
- }
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -8908,94 +9011,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_csub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_csub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- return lowerStructBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
@@ -9092,29 +9113,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
- case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(5));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- Op.getOperand(5), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
-
- EVT VT = Op.getValueType();
- auto *M = cast<MemSDNode>(Op);
-
- return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -9313,22 +9311,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ Opcode = ISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ Opcode = ISD::ATOMIC_LOAD_FMAX;
break;
}
default:
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
+ Ops, M->getMemOperand());
}
case Intrinsic::amdgcn_s_get_barrier_state: {
SDValue Chain = Op->getOperand(0);
@@ -9557,34 +9554,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
};
- case Intrinsic::amdgcn_tbuffer_store: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Dfmt = Op.getConstantOperandVal(8);
- unsigned Nfmt = Op.getConstantOperandVal(9);
- unsigned Glc = Op.getConstantOperandVal(10);
- unsigned Slc = Op.getConstantOperandVal(11);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Op.getOperand(5), // voffset
- Op.getOperand(6), // soffset
- Op.getOperand(7), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
- AMDGPUISD::TBUFFER_STORE_FORMAT;
- MemSDNode *M = cast<MemSDNode>(Op);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -9642,42 +9611,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_store:
- case Intrinsic::amdgcn_buffer_store_format: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Glc = Op.getConstantOperandVal(6);
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
- Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
- MemSDNode *M = cast<MemSDNode>(Op);
-
- // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
-
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
@@ -10083,8 +10016,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
return {N0, SDValue(C1, 0)};
}
-// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
-// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
@@ -10215,7 +10148,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
EVT VDataType, SDLoc DL,
SDValue Ops[],
MemSDNode *M) const {
- if (VDataType == MVT::f16)
+ if (VDataType == MVT::f16 || VDataType == MVT::bf16)
Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
@@ -16063,8 +15996,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
case AMDGPUISD::ATOMIC_CMP_SWAP:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX:
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
case AMDGPUISD::BUFFER_ATOMIC_ADD:
case AMDGPUISD::BUFFER_ATOMIC_SUB:
@@ -16080,7 +16011,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
- case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
// Target-specific read-modify-write atomics are sources of divergence.
@@ -16173,6 +16103,26 @@ static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
<< " operation at memory scope " << MemScope;
}
+/// Return true if \p Ty is a fixed-width vector with exactly two elements
+/// whose element type is either half or bfloat.
+static bool isHalf2OrBFloat2(Type *Ty) {
+  const auto *VecTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VecTy || VecTy->getNumElements() != 2)
+    return false;
+
+  const Type *ScalarTy = VecTy->getElementType();
+  return ScalarTy->isHalfTy() || ScalarTy->isBFloatTy();
+}
+
+/// Return true if \p Ty is a fixed-width vector of exactly two half elements.
+static bool isHalf2(Type *Ty) {
+  if (const auto *Vec = dyn_cast<FixedVectorType>(Ty))
+    return Vec->getNumElements() == 2 && Vec->getElementType()->isHalfTy();
+  return false;
+}
+
+/// Return true if \p Ty is a fixed-width vector of exactly two bfloat elements.
+static bool isBFloat2(Type *Ty) {
+  if (const auto *Vec = dyn_cast<FixedVectorType>(Ty))
+    return Vec->getNumElements() == 2 && Vec->getElementType()->isBFloatTy();
+  return false;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16231,7 +16181,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
: AtomicExpansionKind::CmpXChg;
}
- // TODO: Handle v2f16/v2bf16 cases for gfx940
+ if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+
return AtomicExpansionKind::CmpXChg;
}
@@ -16239,10 +16191,36 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;
- // TODO: gfx940 supports v2f16 and v2bf16
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+ // buffer. gfx12 does have the buffer version.
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
@@ -16284,17 +16262,51 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMax: {
+ Type *Ty = RMW->getType();
+
+ // LDS float and double fmin/fmax were always supported.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
+ return AtomicExpansionKind::None;
+
+ if (unsafeFPAtomicsDisabled(RMW->getFunction()))
+ return AtomicExpansionKind::CmpXChg;
+
+ // Always expand system scope fp atomics.
+ if (HasSystemScope)
+ return AtomicExpansionKind::CmpXChg;
+
+ // For flat and global cases:
+ // float, double in gfx7. Manual claims denormal support.
+ // Removed in gfx8.
+ // float, double restored in gfx10.
+ // double removed again in gfx11, so only f32 for gfx11/gfx12.
+ //
+ // In the gfx9 family, gfx90a and gfx940 support f64 for global (same as
+ // fadd), but not f32.
+ //
+ // FIXME: Check scope and fine grained memory
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
+
+ return AtomicExpansionKind::CmpXChg;
+ }
case AtomicRMWInst::Min:
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- if (RMW->getType()->isFloatTy() &&
- unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
// Always expand system scope min/max atomics.
if (HasSystemScope)
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4c02bb1b306e..1f198a92c0fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -253,9 +253,9 @@ public:
bool shouldExpandVectorDynExt(SDNode *N) const;
private:
- // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
- // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
- // pointed to by Offsets.
+ // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+ // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
+ // array pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, Align Alignment = Align(4)) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 230443313d72..4c53a081cdb2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -641,7 +641,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -1118,7 +1118,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
@@ -2398,7 +2398,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MLI = &getAnalysis<MachineLoopInfo>();
- PDT = &getAnalysis<MachinePostDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
AA = &AAR->getAAResults();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a..cc1b9ac0c9ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::ENTER_PSEUDO_WM:
- case AMDGPU::EXIT_PSEUDO_WM: {
- // These do nothing.
- MI.eraseFromParent();
- break;
- }
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -3978,7 +3972,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src0)
.add(*Src1)
- .addImm(Imm);
+ .addImm(Imm)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -3997,7 +3992,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src0)
.addImm(Imm)
- .add(*Src2);
+ .add(*Src2)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -4018,7 +4014,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src1)
.addImm(Imm)
- .add(*Src2);
+ .add(*Src2)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -4054,7 +4051,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Src2Mods ? Src2Mods->getImm() : 0)
.add(*Src2)
.addImm(Clamp ? Clamp->getImm() : 0)
- .addImm(Omod ? Omod->getImm() : 0);
+ .addImm(Omod ? Omod->getImm() : 0)
+ .setMIFlags(MI.getFlags());
if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
MIB.addImm(OpSel ? OpSel->getImm() : 0);
updateLiveVariables(LV, MI, *MIB);
@@ -5657,24 +5655,9 @@ unsigned SIInstrInfo::buildExtractSubReg(
DebugLoc DL = MI->getDebugLoc();
Register SubReg = MRI.createVirtualRegister(SubRC);
- if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
- BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
- .addReg(SuperReg.getReg(), 0, SubIdx);
- return SubReg;
- }
-
- // Just in case the super register is itself a sub-register, copy it to a new
- // value so we don't need to worry about merging its subreg index with the
- // SubIdx passed to this function. The register coalescer should be able to
- // eliminate this extra copy.
- Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
-
- BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
- .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
-
+ unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
- .addReg(NewSuperReg, 0, SubIdx);
-
+ .addReg(SuperReg.getReg(), 0, NewSubIdx);
return SubReg;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 40289f2addfd..c64b3a7c356f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
// load_d16_{lo|hi} ptr, tied_input
def SIload_d16 : SDTypeProfile<1, 2, [
SDTCisPtrTy<1>,
@@ -222,7 +214,6 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
-defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">;
defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">;
@@ -315,13 +306,6 @@ class isIntType<ValueType SrcVT> {
}
//===----------------------------------------------------------------------===//
-// PatFrags for global memory operations
-//===----------------------------------------------------------------------===//
-
-defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>;
-defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>;
-
-//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
// enable s_mov_b32 m0, -1 to be glued to the memory instructions.
@@ -709,15 +693,24 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
>;
let AddressSpaces = StoreAddress_local.AddrSpaces in {
- defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
- IsInt>;
+
+ if IsInt then {
+ defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ } else {
+ defm _local_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ defm _local_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ }
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
- defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
- IsInt>;
+ if IsInt then {
+ defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ } else {
+ defm _region_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ defm _region_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ }
}
}
@@ -734,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>;
def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
@@ -2233,13 +2226,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
RegisterOperand ret =
- !if(!eq(RC.Size, 32), AVLdSt_32,
- !if(!eq(RC.Size, 64), AVLdSt_64,
- !if(!eq(RC.Size, 96), AVLdSt_96,
- !if(!eq(RC.Size, 128), AVLdSt_128,
- !if(!eq(RC.Size, 160), AVLdSt_160,
- RegisterOperand<VReg_1> // invalid register
- )))));
+ !cond(!eq(RC.Size, 32) : AVLdSt_32,
+ !eq(RC.Size, 64) : AVLdSt_64,
+ !eq(RC.Size, 96) : AVLdSt_96,
+ !eq(RC.Size, 128) : AVLdSt_128,
+ !eq(RC.Size, 160) : AVLdSt_160,
+ !eq(RC.Size, 1024) : AVLdSt_1024);
}
class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
@@ -2271,6 +2263,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit EnableClamp = _EnableClamp;
field bit IsTrue16 = 0;
field bit IsRealTrue16 = 0;
+ field bit IsInvalidSingleUseConsumer = 0;
+ field bit IsInvalidSingleUseProducer = 0;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c3..835f44f9d0d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let Uses = [EXEC];
- let Defs = [EXEC];
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
@@ -252,16 +237,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive),
- [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+ (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive),
- [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+ (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
@@ -3398,7 +3389,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
@@ -3872,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let mayStore = 1;
}
-let Namespace = "AMDGPU" in {
-def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
-def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
-}
-
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
@@ -3901,7 +3887,6 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
-def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index abb72e8e63c3..afc6353ec811 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -48,8 +48,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -60,7 +60,7 @@ char SILateBranchLowering::ID = 0;
INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
@@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5dc3457b5bfa..75a1575f2180 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,7 +149,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addUsedIfAvailable<LiveIntervals>();
// Should preserve the same set that TwoAddressInstructions does.
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreservedID(LiveVariablesID);
@@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
// This doesn't actually need LiveVariables, but we can preserve them.
LV = getAnalysisIfAvailable<LiveVariables>();
- MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 32dad0c425c0..a9ee74dec120 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -51,8 +51,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -399,8 +399,8 @@ private:
INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
false)
@@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
MachineFunctionProperties::Property::Selected))
return false;
- Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(),
- &getAnalysis<MachinePostDominatorTree>());
+ Vreg1LoweringHelper Helper(
+ &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree());
bool Changed = false;
Changed |= Helper.lowerCopiesFromI1();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 072c5aedc220..d9db0f7a4f53 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -83,7 +83,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
- // TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
@@ -233,6 +232,12 @@ Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return ArgInfo.FlatScratchInit.getRegister();
}
+Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
+  // Record the register reported by getNextUserSGPR() as the private segment
+  // size argument, then account for the one user SGPR it occupies. The
+  // register must be queried before the counter is incremented.
+  auto &SizeArg = ArgInfo.PrivateSegmentSize;
+  SizeArg = ArgDescriptor::createRegister(getNextUserSGPR());
+  ++NumUserSGPRs;
+  return SizeArg.getRegister();
+}
+
Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9fe02e24c8a1..7af5e7388f84 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -752,6 +752,7 @@ public:
Register addKernargSegmentPtr(const SIRegisterInfo &TRI);
Register addDispatchID(const SIRegisterInfo &TRI);
Register addFlatScratchInit(const SIRegisterInfo &TRI);
+ Register addPrivateSegmentSize(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
Register addLDSKernelId();
SmallVectorImpl<MCRegister> *
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 8204a70e72d9..18d66e419152 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -148,10 +148,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveVariables>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<LiveVariables>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -618,7 +618,7 @@ char SIOptimizeVGPRLiveRange::ID = 0;
INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
"SI Optimize VGPR LiveRange", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
Loops = &getAnalysis<MachineLoopInfo>();
LV = &getAnalysis<LiveVariables>();
MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1..f47731bf6aac 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+ const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
- using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
+class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
- MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +67,6 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +94,9 @@ public:
virtual ~SDWAOperand() = default;
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +129,9 @@ public:
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +158,9 @@ public:
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
+ if (PotentialMatches != nullptr) {
+ // Fill out the map for all uses if all can be converted
+ MachineOperand *Reg = getReplacedOperand();
+ if (!Reg->isReg() || !Reg->isDef())
+ return nullptr;
+
+ for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
+ // Check that all instructions that use Reg can be converted
+ if (!isConvertibleToSDWA(UseMI, ST, TII))
+ return nullptr;
+
+ // Now that it's guaranteed all uses are legal, iterate over the uses again
+ // to add them for later conversion.
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ // Should not get a subregister here
+ assert(isSameReg(UseMO, *Reg));
+
+ SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
+ MachineInstr *UseMI = UseMO.getParent();
+ potentialMatchesMap[UseMI].push_back(this);
+ }
+ return nullptr;
+ }
+
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
- const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return true;
}
+} // namespace
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f53..5837dbeb3f98 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
- Opc == AMDGPU::ENTER_PSEUDO_WM) {
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
dbgs() << "Entering ";
} else {
- assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
- Opc == AMDGPU::EXIT_PSEUDO_WM);
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Exiting ";
}
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
- } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
- dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
}
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 0d40816cdd4b..212edff09783 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -161,45 +161,6 @@ static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
return Val;
}
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
- int64_t VBlocks, SBlocks;
- VGPRBlocks->evaluateAsAbsolute(VBlocks);
- SGPRBlocks->evaluateAsAbsolute(SBlocks);
-
- uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
- S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
- getComputePGMRSrc1Reg(*this, ST);
-
- return Reg;
-}
-
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
- const GCNSubtarget &ST) const {
- if (AMDGPU::isCompute(CC)) {
- return getComputePGMRSrc1(ST);
- }
- int64_t VBlocks, SBlocks;
- VGPRBlocks->evaluateAsAbsolute(VBlocks);
- SGPRBlocks->evaluateAsAbsolute(SBlocks);
-
- return getPGMRSrc1Reg(*this, CC, ST) |
- S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
- S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
-}
-
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
- int64_t ScratchEn;
- ScratchEnable->evaluateAsAbsolute(ScratchEn);
- return ScratchEn | getComputePGMRSrc2Reg(*this);
-}
-
-uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
- if (AMDGPU::isCompute(CC))
- return getComputePGMRSrc2();
-
- return 0;
-}
-
const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const {
uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index e66e5a194c8b..c358a2d9db10 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -98,16 +98,12 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
void reset(const MachineFunction &MF);
/// Compute the value of the ComputePGMRsrc1 register.
- uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
- uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const;
const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
MCContext &Ctx) const;
/// Compute the value of the ComputePGMRsrc2 register.
- uint64_t getComputePGMRSrc2() const;
- uint64_t getPGMRSrc2(CallingConv::ID CC) const;
const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
};
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4b5f9bdd82b8..4c5e60c873bb 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3157,7 +3157,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const {
- auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+ auto &MDT = LIS->getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068e..f1d9aec16363 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>;
+def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
+def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>;
let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16,
let BaseClassOrder = 10000;
}
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v
let HasSGPR = 1;
}
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -905,8 +907,8 @@ multiclass SRegClass<int numRegs,
}
}
-defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -958,8 +960,8 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
(add VGPR_64)>;
-defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
+defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
@@ -1342,6 +1344,7 @@ def AVLdSt_64 : AVLdStOperand<AV_64, "OPW64">;
def AVLdSt_96 : AVLdStOperand<AV_96, "OPW96">;
def AVLdSt_128 : AVLdStOperand<AV_128, "OPW128">;
def AVLdSt_160 : AVLdStOperand<AV_160, "OPW160">;
+def AVLdSt_1024 : AVLdStOperand<AV_1024, "OPW1024">;
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 647fae904d39..79bcf5e8cd30 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -45,7 +45,6 @@ public:
bool isKImmOperand(const MachineOperand &Src) const;
bool isKUImmOperand(const MachineOperand &Src) const;
bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
- bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
void shrinkScalarCompare(MachineInstr &MI) const;
void shrinkMIMG(MachineInstr &MI) const;
@@ -183,15 +182,36 @@ bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
return false;
}
-/// \returns true if the constant in \p Src should be replaced with a bitreverse
-/// of an inline immediate.
-bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
- int32_t &ReverseImm) const {
- if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
- return false;
+/// \returns the opcode of an instruction a move immediate of the constant \p
+/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
+/// i.e.
+///
+/// If the bitreverse of a constant is an inline immediate, reverse the
+/// immediate and return the bitreverse opcode.
+///
+/// If the bitwise negation of a constant is an inline immediate, invert the
+/// immediate and return the bitwise not opcode.
+static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ int32_t &ModifiedImm, bool Scalar) {
+ if (TII->isInlineConstant(Src))
+ return 0;
+ int32_t SrcImm = static_cast<int32_t>(Src.getImm());
+
+ if (!Scalar) {
+ // We could handle the scalar case here, but we would need to check
+ // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
+ // it, as the reasonable values are already covered by s_movk_i32.
+ ModifiedImm = ~SrcImm;
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return AMDGPU::V_NOT_B32_e32;
+ }
+
+ ModifiedImm = reverseBits<int32_t>(SrcImm);
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;
- ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
- return ReverseImm >= -16 && ReverseImm <= 64;
+ return 0;
}
/// Copy implicit register operands from specified instruction to this
@@ -801,10 +821,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
- int32_t ReverseImm;
- if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
- Src.setImm(ReverseImm);
+ int32_t ModImm;
+ unsigned ModOpcode =
+ canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
+ if (ModOpcode != 0) {
+ MI.setDesc(TII->get(ModOpcode));
+ Src.setImm(static_cast<int64_t>(ModImm));
continue;
}
}
@@ -863,13 +885,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && Dst.getReg().isPhysical()) {
- int32_t ReverseImm;
+ unsigned ModOpc;
+ int32_t ModImm;
if (isKImmOperand(Src)) {
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
Src.setImm(SignExtend64(Src.getImm(), 32));
- } else if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
- Src.setImm(ReverseImm);
+ } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
+ /*Scalar=*/true))) {
+ MI.setDesc(TII->get(ModOpc));
+ Src.setImm(static_cast<int64_t>(ModImm));
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 913942dda19d..742fd397ff9e 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -215,8 +215,6 @@ private:
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
- void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
- MachineInstr *Exit);
void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -241,8 +239,8 @@ public:
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -259,8 +257,8 @@ char SIWholeQuadMode::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
@@ -785,7 +783,7 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
if (MDT)
MDT->getBase().applyUpdates(DTUpdates);
if (PDT)
- PDT->getBase().applyUpdates(DTUpdates);
+ PDT->applyUpdates(DTUpdates);
// Link blocks
MachineInstr *MI =
@@ -1025,31 +1023,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
- MachineInstr *Entry,
- MachineInstr *Exit) {
- assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
- assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
- Register SaveOrig = Entry->getOperand(0).getReg();
-
- MachineInstr *NewEntry =
- BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
- MachineInstr *NewExit =
- BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
- LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
- Exit->eraseFromParent();
-
- LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
- Entry->eraseFromParent();
-
- LIS->removeInterval(SaveOrig);
-}
-
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1066,12 +1039,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
- MachineInstr *StrictEntry = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
- char PreviousState = State;
-
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1084,20 +1054,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
- case AMDGPU::ENTER_STRICT_WQM:
- StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
- break;
- case AMDGPU::EXIT_STRICT_WQM:
- if (State == StateWQM && StrictEntry) {
- // Transition WQM -> StrictWQM -> WQM detected.
- lowerPseudoStrictMode(MBB, StrictEntry, &MI);
- }
- StrictEntry = nullptr;
- break;
- case AMDGPU::ENTER_STRICT_WWM:
- case AMDGPU::EXIT_STRICT_WWM:
- StrictEntry = nullptr;
- break;
default:
break;
}
@@ -1251,11 +1207,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StrictStateNeeded;
-
- // Mark block as needing lower so it will be checked for unnecessary transitions.
- auto BII = Blocks.find(&MBB);
- if (BII != Blocks.end())
- BII->second.NeedsLowering = true;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
@@ -1687,8 +1638,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
- MDT = getAnalysisIfAvailable<MachineDominatorTree>();
- PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
+ auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
+ auto *PDTWrapper =
+ getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+ PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index aee518680a60..64f33199545a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -215,6 +215,11 @@ let isMoveImm = 1 in {
} // End Uses = [SCC]
} // End isMoveImm = 1
+// Variant of S_MOV_B32 used for reading from volatile registers like
+// SRC_POPS_EXITING_WAVE_ID.
+let hasSideEffects = 1 in
+def S_MOV_B32_sideeffects : SOP1_32 <"s_mov_b32">;
+
let Defs = [SCC] in {
def S_NOT_B32 : SOP1_32 <"s_not_b32",
[(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
@@ -1196,11 +1201,15 @@ let SubtargetPredicate = isGFX9Plus in {
}
} // End SubtargetPredicate = isGFX9Plus
+def VersionImm : S16ImmOperand {
+ let DecoderMethod = "decodeVersionImm";
+}
+
let SubtargetPredicate = isGFX10Plus in {
def S_VERSION : SOPK_Pseudo<
"s_version",
(outs),
- (ins s16imm:$simm16),
+ (ins VersionImm:$simm16),
"$simm16"> {
let has_sdst = 0;
}
@@ -1876,6 +1885,12 @@ let SubtargetPredicate = isNotGFX9Plus in {
def : GetFPModePat<fpmode_mask_gfx6plus>;
}
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat<
+ (int_amdgcn_pops_exiting_wave_id),
+ (S_MOV_B32_sideeffects (i32 SRC_POPS_EXITING_WAVE_ID))
+>;
+
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 2e1db1665b9c..3af536dac473 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -669,5 +669,20 @@ const char* const IdSymbolic[] = {
} // namespace VGPRIndexMode
+namespace UCVersion {
+
+ArrayRef<GFXVersion> getGFXVersions() {
+ // GFX6, GFX8 and GFX9 don't support s_version and there are no
+ // UC_VERSION_GFX* codes for them.
+ static const GFXVersion Versions[] = {{"UC_VERSION_GFX7", 0},
+ {"UC_VERSION_GFX10", 4},
+ {"UC_VERSION_GFX11", 6},
+ {"UC_VERSION_GFX12", 9}};
+
+ return Versions;
+}
+
+} // namespace UCVersion
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 069134a7ae7f..c84c1a7dc18c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -116,6 +116,17 @@ extern const char* const IdSymbolic[];
} // namespace VGPRIndexMode
+namespace UCVersion {
+
+struct GFXVersion {
+ StringLiteral Symbol;
+ unsigned Code;
+};
+
+ArrayRef<GFXVersion> getGFXVersions();
+
+} // namespace UCVersion
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4b34fb27632a..9886235121d2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -159,6 +159,12 @@ namespace llvm {
namespace AMDGPU {
+/// \returns true if the target supports signed immediate offset for SMRD
+/// instructions.
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+ return isGFX9Plus(ST);
+}
+
/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI) {
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
@@ -373,10 +379,18 @@ struct VOPTrue16Info {
bool IsTrue16;
};
+struct SingleUseExceptionInfo {
+ uint16_t Opcode;
+ bool IsInvalidSingleUseConsumer;
+ bool IsInvalidSingleUseProducer;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
+#define GET_SingleUseExceptionTable_DECL
+#define GET_SingleUseExceptionTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
#define GET_VOP1InfoTable_DECL
@@ -582,9 +596,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
}
bool isGenericAtomic(unsigned Opc) {
- return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
- Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
- Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
+ return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||
@@ -608,6 +620,16 @@ bool isTrue16Inst(unsigned Opc) {
return Info ? Info->IsTrue16 : false;
}
+bool isInvalidSingleUseConsumerInst(unsigned Opc) {
+ const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+ return Info && Info->IsInvalidSingleUseConsumer;
+}
+
+bool isInvalidSingleUseProducerInst(unsigned Opc) {
+ const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+ return Info && Info->IsInvalidSingleUseProducer;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -2803,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
return isGCN3Encoding(ST) || isGFX10Plus(ST);
}
-static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
- return isGFX9Plus(ST);
-}
-
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset) {
if (isGFX12Plus(ST))
@@ -2841,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
}
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
- int64_t ByteOffset, bool IsBuffer) {
+ int64_t ByteOffset, bool IsBuffer,
+ bool HasSOffset) {
+ // For unbuffered smem loads, it is illegal for the Immediate Offset to be
+ // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
+ // Handle case where SOffset is not present.
+ if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST))
+ return std::nullopt;
+
if (isGFX12Plus(ST)) // 24 bit signed offsets
return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
: std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index cf8236b8e23b..af2f0bc1a630 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -360,6 +360,10 @@ struct EncodingField {
static ValueType decode(uint64_t Encoded) { return Encoded; }
};
+// Represents a single bit in an encoded value.
+template <unsigned Bit, unsigned D = 0>
+using EncodingBit = EncodingField<Bit, Bit, D>;
+
// A helper for encoding and decoding multiple fields.
template <typename... Fields> struct EncodingFields {
static constexpr uint64_t encode(Fields... Values) {
@@ -857,6 +861,12 @@ LLVM_READONLY
bool isTrue16Inst(unsigned Opc);
LLVM_READONLY
+bool isInvalidSingleUseConsumerInst(unsigned Opc);
+
+LLVM_READONLY
+bool isInvalidSingleUseProducerInst(unsigned Opc);
+
+LLVM_READONLY
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
@@ -1297,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -1469,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
/// S_LOAD instructions have a signed offset, on other subtargets it is
/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
- int64_t ByteOffset, bool IsBuffer);
+ int64_t ByteOffset, bool IsBuffer,
+ bool HasSOffset = false);
/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
/// instruction. This is only useful on CI.s
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp
new file mode 100644
index 000000000000..a4f4a9ed5da4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp
@@ -0,0 +1,61 @@
+//===- AMDGPUDelayedMCExpr.cpp - Delayed MCExpr resolve ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDelayedMCExpr.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type,
+ MCValue Val) {
+ msgpack::Document *Doc = DN.getDocument();
+ switch (Type) {
+ default:
+ return Doc->getEmptyNode();
+ case msgpack::Type::Int:
+ return Doc->getNode(static_cast<int64_t>(Val.getConstant()));
+ case msgpack::Type::UInt:
+ return Doc->getNode(static_cast<uint64_t>(Val.getConstant()));
+ case msgpack::Type::Boolean:
+ return Doc->getNode(static_cast<bool>(Val.getConstant()));
+ }
+}
+
+void DelayedMCExprs::assignDocNode(msgpack::DocNode &DN, msgpack::Type Type,
+ const MCExpr *ExprValue) {
+ MCValue Res;
+ if (ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ if (Res.isAbsolute()) {
+ DN = getNode(DN, Type, Res);
+ return;
+ }
+ }
+
+ DelayedExprs.push_back(Expr{DN, Type, ExprValue});
+}
+
+bool DelayedMCExprs::resolveDelayedExpressions() {
+ while (!DelayedExprs.empty()) {
+ Expr DE = DelayedExprs.front();
+ MCValue Res;
+
+ if (!DE.ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr) ||
+ !Res.isAbsolute())
+ return false;
+
+ DelayedExprs.pop_front();
+ DE.DN = getNode(DE.DN, DE.Type, Res);
+ }
+
+ return true;
+}
+
+void DelayedMCExprs::clear() { DelayedExprs.clear(); }
+
+bool DelayedMCExprs::empty() { return DelayedExprs.empty(); }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h
new file mode 100644
index 000000000000..8c9cda3a1bdd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h
@@ -0,0 +1,39 @@
+//===- AMDGPUDelayedMCExpr.h - Delayed MCExpr resolve -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
+
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include <deque>
+
+namespace llvm {
+class MCExpr;
+
+class DelayedMCExprs {
+ struct Expr {
+ msgpack::DocNode &DN;
+ msgpack::Type Type;
+ const MCExpr *ExprValue;
+ Expr(msgpack::DocNode &DN, msgpack::Type Type, const MCExpr *ExprValue)
+ : DN(DN), Type(Type), ExprValue(ExprValue) {}
+ };
+
+ std::deque<Expr> DelayedExprs;
+
+public:
+ bool resolveDelayedExpressions();
+ void assignDocNode(msgpack::DocNode &DN, msgpack::Type Type,
+ const MCExpr *ExprValue);
+ void clear();
+ bool empty();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 0fa67c559cb2..a53bf70d7771 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -20,6 +20,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/EndianStream.h"
@@ -137,12 +138,22 @@ void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
setRegister(getRsrc1Reg(CC), Val);
}
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ setRegister(getRsrc1Reg(CC), Val, Ctx);
+}
+
// Set the rsrc2 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
setRegister(getRsrc1Reg(CC) + 1, Val);
}
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ setRegister(getRsrc1Reg(CC) + 1, Val, Ctx);
+}
+
// Set the SPI_PS_INPUT_ENA register in the metadata.
// In fact this ORs the value into any previous setting of the register.
void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
@@ -182,6 +193,40 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
N = N.getDocument()->getNode(Val);
}
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (!isLegacy()) {
+ // In the new MsgPack format, ignore register numbered >= 0x10000000. It
+ // is a PAL ABI pseudo-register in the old non-MsgPack format.
+ if (Reg >= 0x10000000)
+ return;
+ }
+ auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+ auto ExprIt = REM.find(Reg);
+
+ if (ExprIt != REM.end()) {
+ Val = MCBinaryExpr::createOr(Val, ExprIt->getSecond(), Ctx);
+ // This conditional may be redundant most of the time, but the alternate
+ // setRegister(unsigned, unsigned) could've been called while the
+ // conditional returns true (i.e., Reg exists in REM).
+ if (N.getKind() == msgpack::Type::UInt) {
+ const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
+ Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
+ }
+ ExprIt->getSecond() = Val;
+ } else if (N.getKind() == msgpack::Type::UInt) {
+ const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
+ Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
+ int64_t Unused;
+ if (!Val->evaluateAsAbsolute(Unused))
+ REM[Reg] = Val;
+ (void)Unused;
+ }
+ DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
+}
+
// Set the entry point name for one shader.
void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
if (isLegacy())
@@ -207,11 +252,29 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedVgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_VGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedVgprsKey, Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".vgpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the number of used agprs in the metadata.
void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".agpr_count"] = Val;
}
+void AMDGPUPALMetadata::setNumUsedAgprs(unsigned CC, const MCExpr *Val) {
+ setHwStage(CC, ".agpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
@@ -228,6 +291,20 @@ void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setNumUsedSgprs(unsigned CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedSgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_SGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedSgprsKey, Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".sgpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the scratch size in the metadata.
void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
if (isLegacy()) {
@@ -239,6 +316,17 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setScratchSize(unsigned CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ setRegister(getScratchSizeKey(CC), Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".scratch_memory_size", msgpack::Type::UInt, Val);
+}
+
// Set the stack frame size of a function in the metadata.
void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) {
auto Node = getShaderFunction(FnName);
@@ -259,6 +347,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName,
Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName,
+ const MCExpr *Val) {
+ auto Node = getShaderFunction(FnName);
+ DelayedExprs.assignDocNode(Node[".vgpr_count"], msgpack::Type::UInt, Val);
+}
+
// Set the number of used vgprs in the metadata.
void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
unsigned Val) {
@@ -266,6 +360,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
+ const MCExpr *Val) {
+ auto Node = getShaderFunction(FnName);
+ DelayedExprs.assignDocNode(Node[".sgpr_count"], msgpack::Type::UInt, Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -662,6 +762,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
String.clear();
if (!BlobType)
return;
+ ResolvedAll = DelayedExprs.resolveDelayedExpressions();
raw_string_ostream Stream(String);
if (isLegacy()) {
if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
@@ -711,6 +812,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+ ResolvedAll = DelayedExprs.resolveDelayedExpressions();
if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
@@ -906,11 +1008,17 @@ void AMDGPUPALMetadata::setLegacy() {
// Erase all PAL metadata.
void AMDGPUPALMetadata::reset() {
MsgPackDoc.clear();
+ REM.clear();
+ DelayedExprs.clear();
Registers = MsgPackDoc.getEmptyNode();
HwStages = MsgPackDoc.getEmptyNode();
ShaderFunctions = MsgPackDoc.getEmptyNode();
}
+bool AMDGPUPALMetadata::resolvedAllMCExpr() {
+ return ResolvedAll && DelayedExprs.empty();
+}
+
unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
assert(idx < 2 &&
"illegal index to PAL version - should be 0 (major) or 1 (minor)");
@@ -942,6 +1050,11 @@ void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) {
getHwStage(CC)[field] = Val;
}
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field,
+ msgpack::Type Type, const MCExpr *Val) {
+ DelayedExprs.assignDocNode(getHwStage(CC)[field], Type, Val);
+}
+
void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) {
getComputeRegisters()[field] = Val;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 158f766d0485..e05532afed2f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -13,7 +13,10 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+#include "AMDGPUDelayedMCExpr.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/MC/MCContext.h"
namespace llvm {
@@ -21,6 +24,10 @@ class Module;
class StringRef;
class AMDGPUPALMetadata {
+public:
+ using RegisterExprMap = DenseMap<unsigned, const MCExpr *>;
+
+private:
unsigned BlobType = 0;
msgpack::Document MsgPackDoc;
msgpack::DocNode Registers;
@@ -32,6 +39,10 @@ class AMDGPUPALMetadata {
msgpack::DocNode ComputeRegisters;
msgpack::DocNode GraphicsRegisters;
+ DelayedMCExprs DelayedExprs;
+ RegisterExprMap REM;
+ bool ResolvedAll = true;
+
public:
// Read the amdgpu.pal.metadata supplied by the frontend, ready for
// per-function modification.
@@ -45,10 +56,12 @@ public:
// Set the rsrc1 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void setRsrc1(unsigned CC, unsigned Val);
+ void setRsrc1(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the rsrc2 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void setRsrc2(unsigned CC, unsigned Val);
+ void setRsrc2(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the SPI_PS_INPUT_ENA register in the metadata.
// In fact this ORs the value into any previous setting of the register.
@@ -64,6 +77,7 @@ public:
// Set a register in the metadata.
// In fact this ORs the value into any previous setting of the register.
void setRegister(unsigned Reg, unsigned Val);
+ void setRegister(unsigned Reg, const MCExpr *Val, MCContext &Ctx);
// Set the entry point name for one shader.
void setEntryPoint(unsigned CC, StringRef Name);
@@ -72,18 +86,22 @@ public:
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of vgprs to allocate.
void setNumUsedVgprs(unsigned CC, unsigned Val);
+ void setNumUsedVgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the number of used agprs in the metadata. This is an optional advisory
// record for logging etc;
void setNumUsedAgprs(unsigned CC, unsigned Val);
+ void setNumUsedAgprs(unsigned CC, const MCExpr *Val);
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
void setNumUsedSgprs(unsigned CC, unsigned Val);
+ void setNumUsedSgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the scratch size in the metadata.
void setScratchSize(unsigned CC, unsigned Val);
+ void setScratchSize(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(StringRef FnName, unsigned Val);
@@ -97,11 +115,13 @@ public:
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of vgprs to allocate.
void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val);
+ void setFunctionNumUsedVgprs(StringRef FnName, const MCExpr *Val);
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val);
+ void setFunctionNumUsedSgprs(StringRef FnName, const MCExpr *Val);
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
@@ -138,6 +158,8 @@ public:
void setHwStage(unsigned CC, StringRef field, unsigned Val);
void setHwStage(unsigned CC, StringRef field, bool Val);
+ void setHwStage(unsigned CC, StringRef field, msgpack::Type Type,
+ const MCExpr *Val);
void setComputeRegisters(StringRef field, unsigned Val);
void setComputeRegisters(StringRef field, bool Val);
@@ -156,6 +178,8 @@ public:
// Erase all PAL metadata.
void reset();
+ bool resolvedAllMCExpr();
+
private:
// Return whether the blob type is legacy PAL metadata.
bool isLegacy() const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index eaee1a2a9739..720d5a1853db 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -14,6 +14,7 @@
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/SIDefinesUtils.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
@@ -220,43 +221,6 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
return map.lookup(name) - 1; // returns -1 if not found
}
-static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
- unsigned Shift = 0;
- unsigned Mask = 0;
-
- Mask = ~Value;
- for (; !(Mask & 1); Shift++, Mask >>= 1) {
- }
-
- return std::make_pair(Shift, Mask);
-}
-
-static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask,
- uint32_t Shift, MCContext &Ctx) {
- if (Mask) {
- const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
- Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
- }
- if (Shift) {
- const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
- Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
- }
- return Val;
-}
-
-static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask,
- uint32_t Shift, MCContext &Ctx) {
- if (Shift) {
- const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
- Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
- }
- if (Mask) {
- const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
- Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
- }
- return Val;
-}
-
class PrintField {
public:
template <typename T, T AMDGPUMCKernelCodeT::*ptr,
@@ -305,10 +269,10 @@ static ArrayRef<PrintFx> getPrinterTable() {
const MCExpr *Value; \
if (PGMType == 0) { \
Value = \
- MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
+ maskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
} else { \
Value = \
- MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
+ maskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
} \
int64_t Val; \
if (Value->evaluateAsAbsolute(Val)) \
@@ -392,7 +356,7 @@ static ArrayRef<ParseFx> getParserTable() {
if (!parseExpr(MCParser, Value, Err)) \
return false; \
auto [Shift, Mask] = getShiftMask(Complement); \
- Value = MaskShiftSet(Value, Mask, Shift, Ctx); \
+ Value = maskShiftSet(Value, Mask, Shift, Ctx); \
const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \
if (PGMType == 0) { \
C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \
@@ -542,7 +506,7 @@ void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) {
const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx);
CodeProps = MCBinaryExpr::createOr(
CodeProps,
- MaskShiftSet(is_dynamic_callstack,
+ maskShiftSet(is_dynamic_callstack,
(1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx),
Ctx);
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 2f4ce8eaf1d6..09b8da9f5dd4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_component_library(LLVMAMDGPUUtils
AMDGPUAsmUtils.cpp
AMDGPUBaseInfo.cpp
+ AMDGPUDelayedMCExpr.cpp
AMDGPUMemoryUtils.cpp
AMDGPUPALMetadata.cpp
AMDKernelCodeTUtils.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h
new file mode 100644
index 000000000000..64d21de12c26
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h
@@ -0,0 +1,79 @@
+//===-- SIDefinesUtils.h - SI Helper Functions ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+/// \file - utility functions for the SIDefines and its common uses.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
+
+#include "llvm/MC/MCExpr.h"
+#include <utility>
+
+namespace llvm {
+class MCContext;
+namespace AMDGPU {
+
+/// Deduce the least significant bit aligned shift and mask values for a binary
+/// Complement \p Value (as they're defined in SIDefines.h as C_*) as a returned
+/// pair<shift, mask>. That is to say \p Value == ~(mask << shift)
+///
+/// For example, given C_00B848_FWD_PROGRESS (i.e., 0x7FFFFFFF) from
+/// SIDefines.h, this will return the pair as (31,1).
+constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
+ unsigned Shift = 0;
+ unsigned Mask = 0;
+
+ Mask = ~Value;
+ for (; !(Mask & 1); Shift++, Mask >>= 1) {
+ }
+
+ return std::make_pair(Shift, Mask);
+}
+
+/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return
+/// the masked and left shifted, in said order of operations, MCExpr * created
+/// within the MCContext \p Ctx.
+///
+/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr
+/// * will be the equivalent of (Val & 0xf) << 6
+inline const MCExpr *maskShiftSet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
+}
+
+/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return
+/// the right shifted and masked, in said order of operations, MCExpr * created
+/// within the MCContext \p Ctx.
+///
+/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr
+/// * will be the equivalent of (Val >> 6) & 0xf
+inline const MCExpr *maskShiftGet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
+ }
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ return Val;
+}
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index b96c41c1e12a..2c0d61ee4afa 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+ [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
@@ -249,9 +249,15 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
- getVOP1Pat<int_amdgcn_readfirstlane,
- VOP_READFIRSTLANE>.ret, 1> {
+ [], 1> {
let isConvergent = 1;
+ let IsInvalidSingleUseConsumer = 1;
+}
+
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))),
+ (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
+ >;
}
let isReMaterializable = 1 in {
@@ -362,6 +368,7 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC32 = VRegSrc_32;
let Src0RC64 = VRegSrc_32;
+ let IsInvalidSingleUseConsumer = 1;
}
// Special case because there are no true output operands. Hack vdst
@@ -405,8 +412,12 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
let EmitDst = 1; // force vdst emission
}
-def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
-def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32>;
+let IsInvalidSingleUseProducer = 1 in {
+ def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
+ def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32> {
+ let IsInvalidSingleUseConsumer = 1;
+ }
+}
let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
// v_movreld_b32 is a special case because the destination output
@@ -535,6 +546,7 @@ let SubtargetPredicate = isGFX9Plus in {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
+ let IsInvalidSingleUseConsumer = 1;
}
let isReMaterializable = 1 in
@@ -699,6 +711,8 @@ let SubtargetPredicate = isGFX10Plus in {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
+ let IsInvalidSingleUseConsumer = 1;
+ let IsInvalidSingleUseProducer = 1;
}
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus
@@ -718,15 +732,22 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
- getVOP1Pat<int_amdgcn_permlane64,
- VOP_MOVRELS>.ret,
- /*VOP1Only=*/ 1>;
+ [], /*VOP1Only=*/ 1> {
+ let IsInvalidSingleUseConsumer = 1;
+ let IsInvalidSingleUseProducer = 1;
+ }
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
} // End SubtargetPredicate = isGFX11Plus
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
+ (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
+ >;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index ccb5b33dbdc4..9989752c2f6b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -779,15 +779,25 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
} // End isCommutable = 1
// These are special and do not read the exec mask.
-let isConvergent = 1, Uses = []<Register> in {
-def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in {
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> {
+ let IsInvalidSingleUseProducer = 1;
+ }
} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
+ (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
+ >;
+
+ def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
+ (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
+ >;
+}
+
let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 479c0aaf0174..efa8e9c74d44 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -13,9 +13,11 @@ def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
+ let IsSingle = 1;
}
def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
+ let IsSingle = 1;
}
}
@@ -105,7 +107,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
}
class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
-
+ let IsSingle = 1;
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
@@ -155,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
} // End SubtargetPredicate = isNotGFX12Plus
} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteIntMul] in {
+let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in {
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
-} // End SchedRW = [WriteIntMul]
+} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
@@ -258,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
let isReMaterializable = 1 in
defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in {
defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1
let isReMaterializable = 1 in {
@@ -275,14 +277,16 @@ let SchedRW = [Write64Bit] in {
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
} // End SubtargetPredicate = isGFX6GFX7
+ let IsInvalidSingleUseConsumer = 1 in {
let SubtargetPredicate = isGFX8Plus in {
defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
- } // End SubtargetPredicate = isGFX8Plus
+ } // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in {
defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
} // End SubtargetPredicate = isGFX8GFX9GFX10GFX11
+ } // End IsInvalidSingleUseConsumer = 1
} // End SchedRW = [Write64Bit]
} // End isReMaterializable = 1
@@ -307,14 +311,14 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
let HasModifiers = 0;
}
-let SubtargetPredicate = isGFX7Plus in {
+let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
-} // End SubtargetPredicate = isGFX7Plus
+} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1
-let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
+let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in {
let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
@@ -324,7 +328,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
}
-} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU]
+} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1
let FPDPRounding = 1 in {
@@ -838,9 +842,9 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
GISDNodeXFormEquiv<opsel_i1timm>;
class PermlanePat<SDPatternOperator permlane,
- Instruction inst> : GCNPat<
- (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
- timm:$fi, timm:$bc),
+ Instruction inst, ValueType vt> : GCNPat<
+ (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc)),
(inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
>;
@@ -859,13 +863,15 @@ let SubtargetPredicate = isGFX10Plus in {
} // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in {
defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
- } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1
- def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
- def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
+ foreach vt = Reg32Types.types in {
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
+ }
defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
@@ -1275,11 +1281,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
-
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
- defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
+ let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in {
+ defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
+ } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1
+} // End IsInvalidSingleUseConsumer = 1
let SubtargetPredicate = isGFX10Before1030 in {
defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4c78bd94458d..4cab15435199 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>;
let FPDPRounding = 1 in {
defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
@@ -382,15 +382,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
AMDGPUfdot2, 1/*ExplicitClamp*/>;
let OtherPredicates = [HasDot7Insts] in {
-defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+}
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End OtherPredicates = [HasDot7Insts]
let OtherPredicates = [HasDot1Insts] in {
-defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+}
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End OtherPredicates = [HasDot1Insts]
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 372c4f533629..3bcee28a2cb7 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -435,8 +435,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+let IsInvalidSingleUseConsumer = 1 in {
+ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+}
multiclass VOPCX_F16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -465,8 +467,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> {
multiclass VOPCX_I32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
-multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+let IsInvalidSingleUseConsumer = 1 in {
+ multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 5d1573d8dec1..2b05165cc94b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -17,6 +17,8 @@ class LetDummies {
bit isReMaterializable;
bit isAsCheapAsAMove;
bit FPDPRounding;
+ bit IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -81,6 +83,8 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string Mnemonic = opName;
Instruction Opcode = !cast<Instruction>(NAME);
bit IsTrue16 = P.IsTrue16;
+ bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer;
VOPProfile Pfl = P;
string AsmOperands;
@@ -175,6 +179,8 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
class VOP_Real<VOP_Pseudo ps> {
Instruction Opcode = !cast<Instruction>(NAME);
bit IsSingle = ps.Pfl.IsSingle;
+ bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer;
}
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -823,17 +829,11 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
- InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
- VOP <OpName>,
- SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
-
- let isPseudo = 1;
- let isCodeGenOnly = 1;
+ VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
let VALU = 1;
let DPP = 1;
@@ -846,7 +846,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let isConvergent = 1;
- string Mnemonic = OpName;
string AsmOperands = asmOps;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
@@ -857,7 +856,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "GFX8";
- VOPProfile Pfl = P;
+ let IsInvalidSingleUseConsumer = !not(VINTERP);
+ let IsInvalidSingleUseProducer = !not(VINTERP);
}
class VOP3_DPP_Pseudo <string OpName, VOPProfile P> :
@@ -1725,3 +1725,12 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
}
+
+def SingleUseExceptionTable : GenericTable {
+ let FilterClass = "VOP_Pseudo";
+ let CppTypeName = "SingleUseExceptionInfo";
+ let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getSingleUseExceptionHelper";
+}