diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
98 files changed, 2894 insertions, 1893 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d0d7a9dc1724..63d83346528a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -351,6 +351,7 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", "GFX90AInsts", "true", "Additional instructions for GFX90A+" + // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO >; def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", @@ -711,6 +712,30 @@ def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", [FeatureFlatGlobalInsts] >; +def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32", + "HasAtomicFMinFMaxF32GlobalInsts", + "true", + "Has global/buffer instructions for atomicrmw fmin/fmax for float" +>; + +def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64", + "HasAtomicFMinFMaxF64GlobalInsts", + "true", + "Has global/buffer instructions for atomicrmw fmin/fmax for double" +>; + +def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", + "HasAtomicFMinFMaxF32FlatInsts", + "true", + "Has flat memory instructions for atomicrmw fmin/fmax for float" +>; + +def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", + "HasAtomicFMinFMaxF64FlatInsts", + "true", + "Has flat memory instructions for atomicrmw fmin/fmax for double" +>; + def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", "HasAtomicFaddNoRtnInsts", "true", @@ -743,6 +768,12 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf [FeatureFlatGlobalInsts] >; +def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst", + "HasAtomicBufferPkAddBF16Inst", + "true", + "Has buffer_atomic_pk_add_bf16 instruction" +>; + def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts", "HasAtomicCSubNoRtnInsts", "true", @@ -1061,7 
+1092,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts ] >; @@ -1072,7 +1104,9 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, - FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero + FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts ] >; @@ -1127,7 +1161,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63 + FeatureMaxHardClauseLength63, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts ] >; @@ -1148,7 +1184,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32 + FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts ] >; @@ -1169,7 +1206,8 @@ def FeatureGFX12 : 
GCNSubtargetFeatureGeneration<"GFX12", FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32 + FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts ] >; @@ -1332,7 +1370,10 @@ def FeatureISAVersion9_0_A : FeatureSet< FeaturePackedTID, FullRate64Ops, FeatureBackOffBarrier, - FeatureKernargPreload])>; + FeatureKernargPreload, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts + ])>; def FeatureISAVersion9_0_C : FeatureSet< !listconcat(FeatureISAVersion9_0_Consumer_Common.Features, @@ -1372,7 +1413,10 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureArchitectedFlatScratch, FullRate64Ops, FeatureBackOffBarrier, - FeatureKernargPreload]>; + FeatureKernargPreload, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts + ]>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, @@ -1561,6 +1605,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureAtomicFlatPkAdd16Insts, FeatureAtomicBufferGlobalPkAddF16Insts, FeatureAtomicGlobalPkAddBF16Inst, + FeatureAtomicBufferPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, @@ -1572,7 +1617,9 @@ def FeatureISAVersion12 : FeatureSet< FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, - FeatureDPPSrc1SGPR]>; + FeatureDPPSrc1SGPR, + FeatureMaxHardClauseLength32, + Feature1_5xVGPRs]>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, @@ -1862,9 +1909,28 @@ def isGFX12Plus : def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; -def HasBufferFlatGlobalAtomicsF64 : + +def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd 
Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + // FIXME: This is too coarse, and working around using pseudo's predicates on real instruction. + AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>; + +def HasAtomicFMinFMaxF32GlobalInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>; + +def HasAtomicFMinFMaxF64GlobalInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>; + +def HasAtomicFMinFMaxF32FlatInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>; + +def HasAtomicFMinFMaxF64FlatInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>; + def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, AssemblerPredicate<(any_of FeatureGFX90AInsts)>; @@ -2118,7 +2184,10 @@ def HasAtomicBufferGlobalPkAddF16Insts AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; def HasAtomicGlobalPkAddBF16Inst : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; + AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; +def HasAtomicBufferPkAddBF16Inst + : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">, + AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>; def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index de25f9241a50..f57fc168c1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -115,6 +115,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue( return std::tuple( PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_SIZE: + return {PrivateSegmentSize ? &PrivateSegmentSize : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)}; case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR: return std::tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, &AMDGPU::SGPR_64RegClass, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 42b33c50d9f8..2e02bb4271ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -114,11 +114,12 @@ struct AMDGPUFunctionArgInfo { PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, IMPLICIT_BUFFER_PTR = 15, IMPLICIT_ARG_PTR = 16, + PRIVATE_SEGMENT_SIZE = 17, // VGPRS: - WORKITEM_ID_X = 17, - WORKITEM_ID_Y = 18, - WORKITEM_ID_Z = 19, + WORKITEM_ID_X = 18, + WORKITEM_ID_Y = 19, + WORKITEM_ID_Z = 20, FIRST_VGPR_VALUE = WORKITEM_ID_X }; // clang-format on diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cad4a3430327..e49925f86bd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -29,6 +29,7 @@ #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" +#include "Utils/SIDefinesUtils.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -135,15 +136,6 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { getTargetStreamer()->getPALMetadata()->readFromIR(M); } -uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) { - int64_t Val; - if (!Value->evaluateAsAbsolute(Val)) { - 
Ctx.reportError(SMLoc(), "could not resolve expression when required."); - return 0; - } - return static_cast<uint64_t>(Val); -} - void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { // Init target streamer if it has not yet happened if (!IsTargetStreamerInitialized) @@ -248,14 +240,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), - getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context), - getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) - - IsaInfo::getNumExtraSGPRs( - &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context), - getMCExprValue(CurrentProgramInfo.FlatUsed, Context), - getTargetStreamer()->getTargetID()->isXnackOnOrAny()), - getMCExprValue(CurrentProgramInfo.VCCUsed, Context), - getMCExprValue(CurrentProgramInfo.FlatUsed, Context)); + CurrentProgramInfo.NumVGPRsForWavesPerEU, + MCBinaryExpr::createSub( + CurrentProgramInfo.NumSGPRsForWavesPerEU, + AMDGPUMCExpr::createExtraSGPRs( + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context), + Context), + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); Streamer.popSection(); } @@ -400,9 +392,40 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( false); } -uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( +SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { + SmallString<128> Str; + raw_svector_ostream OSS(Str); + int64_t IVal; + if (Value->evaluateAsAbsolute(IVal)) { + OSS << static_cast<uint64_t>(IVal); + } else { + Value->print(OSS, MAI); + } + return Str; +} + +void AMDGPUAsmPrinter::emitCommonFunctionComments( + const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, + const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, + const AMDGPUMachineFunction *MFI) { + 
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); + OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false); + OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false); + if (NumAGPR && TotalNumVGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR), + false); + } + OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize), + false); + OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), + false); +} + +const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); @@ -430,16 +453,28 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } + if (UserSGPRInfo.hasPrivateSegmentSize()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + } if (MF.getSubtarget<GCNSubtarget>().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) && - CodeObjectVersion >= AMDGPU::AMDHSA_COV5) - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; - - return KernelCodeProperties; + // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be + // un-evaluatable at this point so it cannot be conditionally checked here. + // Instead, we'll directly shift the possibly unknown MCExpr into its place + // and bitwise-or it into KernelCodeProperties. 
+ const MCExpr *KernelCodePropExpr = + MCConstantExpr::create(KernelCodeProperties, Ctx); + const MCExpr *OrValue = MCConstantExpr::create( + amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); + OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack, + OrValue, Ctx); + KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx); + + return KernelCodePropExpr; } MCKernelDescriptor @@ -462,11 +497,15 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); - KernelDescriptor.kernel_code_properties = - MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx); - - assert(STM.hasGFX90AInsts() || - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); + KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + + int64_t PGRM_Rsrc3 = 1; + bool EvaluatableRsrc3 = + CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3); + (void)PGRM_Rsrc3; + (void)EvaluatableRsrc3; + assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 || + static_cast<uint64_t>(PGRM_Rsrc3) == 0); KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; KernelDescriptor.kernarg_preload = MCConstantExpr::create( @@ -554,13 +593,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" Kernel info:", false); emitCommonFunctionComments( - getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx), - STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx) - : std::optional<uint32_t>(), - getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx), - getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx), - getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), - getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() ? 
CurrentProgramInfo.NumAccVGPR : nullptr, + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, + CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -571,43 +607,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " bytes/workgroup (compile time only)", false); OutStreamer->emitRawComment( - " SGPRBlocks: " + - Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)), - false); + " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( - " VGPRBlocks: " + - Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)), - false); + " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false); OutStreamer->emitRawComment( " NumSGPRsForWavesPerEU: " + - Twine( - getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)), + getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); OutStreamer->emitRawComment( " NumVGPRsForWavesPerEU: " + - Twine( - getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)), + getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); - if (STM.hasGFX90AInsts()) + if (STM.hasGFX90AInsts()) { + const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd( + CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx); + AdjustedAccum = MCBinaryExpr::createMul( + AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx); OutStreamer->emitRawComment( - " AccumOffset: " + - Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) * - 4), - false); + " AccumOffset: " + getMCExprStr(AdjustedAccum), false); + } OutStreamer->emitRawComment( - " Occupancy: " + - Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)), - false); + " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false); OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( " 
COMPUTE_PGM_RSRC2:SCRATCH_EN: " + - Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)), + getMCExprStr(CurrentProgramInfo.ScratchEnable), false); OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(CurrentProgramInfo.UserSGPR), @@ -628,20 +659,25 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Twine(CurrentProgramInfo.TIdIGCompCount), false); + [[maybe_unused]] int64_t PGMRSrc3; assert(STM.hasGFX90AInsts() || - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); + (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute( + PGMRSrc3) && + static_cast<uint64_t>(PGMRSrc3) == 0)); if (STM.hasGFX90AInsts()) { OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + - Twine((AMDHSA_BITS_GET( - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), + getMCExprStr(MCKernelDescriptor::bits_get( + CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)), false); OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + - Twine((AMDHSA_BITS_GET( - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), + getMCExprStr(MCKernelDescriptor::bits_get( + CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)), false); } } @@ -765,7 +801,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. 
- const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs( + const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs( ProgInfo.VCCUsed, ProgInfo.FlatUsed, getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); @@ -858,27 +894,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } } - ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax( + ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); - ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax( + ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); - ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR( + ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. unsigned MaxWaves = MFI->getMaxWavesPerEU(); - ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(1ul), - CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, - Ctx); - ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( - {ProgInfo.NumVGPR, CreateExpr(1ul), - CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, - Ctx); + ProgInfo.NumSGPRsForWavesPerEU = + AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, + Ctx); + ProgInfo.NumVGPRsForWavesPerEU = + AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, + Ctx); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { @@ -927,10 +963,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned Granule) { const MCExpr *OneConst = CreateExpr(1ul); const MCExpr *GranuleConst = CreateExpr(Granule); - const MCExpr *MaxNumGPR = - AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx); + const MCExpr *MaxNumGPR = 
AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx); const MCExpr *AlignToGPR = - AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); + AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); const MCExpr *DivGPR = MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); @@ -972,7 +1007,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // The MCExpr equivalent of divideCeil. auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { const MCExpr *Ceil = - AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx); + AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx); return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx); }; @@ -1045,7 +1080,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } - ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy( + ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); @@ -1207,41 +1242,49 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, auto &Ctx = MF.getContext(); MD->setEntryPoint(CC, MF.getFunction().getName()); - MD->setNumUsedVgprs( - CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); + MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); // Only set AGPRs for supported devices const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); if (STM.hasMAIInsts()) { - MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)); + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); } - MD->setNumUsedSgprs( - CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx); if (MD->getPALMajorVersion() < 3) { - MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, 
STM)); + MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx); if (AMDGPU::isCompute(CC)) { - MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); + MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); } else { - if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0) - MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + const MCExpr *HasScratchBlocks = + MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks, + MCConstantExpr::create(0, Ctx), Ctx); + auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN); + MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx); } } else { MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); - MD->setHwStage(CC, ".scratch_en", - (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)); + MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean, + CurrentProgramInfo.ScratchEnable); EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM); } // ScratchSize is in bytes, 16 aligned. MD->setScratchSize( - CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16)); + CC, + AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize, + MCConstantExpr::create(16, Ctx), Ctx), + Ctx); + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) : CurrentProgramInfo.LDSBlocks; if (MD->getPALMajorVersion() < 3) { - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); + MD->setRsrc2( + CC, + MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx), + Ctx); MD->setSpiPsInputEna(MFI->getPSInputEnable()); MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } else { @@ -1288,20 +1331,19 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { if (MD->getPALMajorVersion() < 3) { // Set compute registers - MD->setRsrc1(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST)); + MD->setRsrc1( + CallingConv::AMDGPU_CS, + CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx); MD->setRsrc2(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getComputePGMRSrc2()); + CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); } else { EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST); } // Set optional info MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize); - MD->setFunctionNumUsedVgprs( - FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); - MD->setFunctionNumUsedSgprs( - FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); + MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU); } // This is supposed to be log2(Size) @@ -1362,6 +1404,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, if (UserSGPRInfo.hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + if (UserSGPRInfo.hasPrivateSegmentSize()) + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; @@ -1463,28 +1508,26 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks( // remarks to simulate newlines. 
If and when clang does accept newlines, this // formatting should be aggregated into one remark with newlines to avoid // printing multiple diagnostic location and diag opts. - MCContext &MCCtx = MF.getContext(); EmitResourceUsageRemark("FunctionName", "Function Name", MF.getFunction().getName()); EmitResourceUsageRemark("NumSGPR", "SGPRs", - getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx)); - EmitResourceUsageRemark( - "NumVGPR", "VGPRs", - getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx)); + getMCExprStr(CurrentProgramInfo.NumSGPR)); + EmitResourceUsageRemark("NumVGPR", "VGPRs", + getMCExprStr(CurrentProgramInfo.NumArchVGPR)); if (hasMAIInsts) { - EmitResourceUsageRemark( - "NumAGPR", "AGPRs", - getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx)); + EmitResourceUsageRemark("NumAGPR", "AGPRs", + getMCExprStr(CurrentProgramInfo.NumAccVGPR)); } - EmitResourceUsageRemark( - "ScratchSize", "ScratchSize [bytes/lane]", - getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx)); + EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", + getMCExprStr(CurrentProgramInfo.ScratchSize)); + int64_t DynStack; + bool DynStackEvaluatable = + CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack); StringRef DynamicStackStr = - getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True" - : "False"; + DynStackEvaluatable && DynStack ? 
"True" : "False"; EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", - getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx)); + getMCExprStr(CurrentProgramInfo.Occupancy)); EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", CurrentProgramInfo.SGPRSpill); EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 87156f27fc6c..f70a60aef007 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -65,12 +65,16 @@ private: uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, const AMDGPUMachineFunction *MFI); + void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR, + const MCExpr *TotalNumVGPR, + const MCExpr *NumSGPR, + const MCExpr *ScratchSize, uint64_t CodeSize, + const AMDGPUMachineFunction *MFI); void emitResourceUsageRemarks(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, bool isModuleEntryFunction, bool hasMAIInsts); - uint16_t getAmdhsaKernelCodeProperties( - const MachineFunction &MF) const; + const MCExpr *getAmdhsaKernelCodeProperties(const MachineFunction &MF) const; AMDGPU::MCKernelDescriptor getAmdhsaKernelDescriptor(const MachineFunction &MF, @@ -78,7 +82,7 @@ private: void initTargetStreamer(Module &M); - static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx); + SmallString<128> getMCExprStr(const MCExpr *Value); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1d645002b1fe..d7ef6f3c5dc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { switch 
(I.getIntrinsicID()) { default: return; - case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: Op = AtomicRMWInst::Add; break; - case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: Op = AtomicRMWInst::Sub; break; - case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: Op = AtomicRMWInst::And; break; - case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: case Intrinsic::amdgcn_raw_buffer_atomic_or: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: Op = AtomicRMWInst::Or; break; - case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_struct_buffer_atomic_xor: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_raw_buffer_atomic_xor: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: Op = AtomicRMWInst::Xor; break; - case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_raw_buffer_atomic_smin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: Op = AtomicRMWInst::Min; break; - case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_raw_buffer_atomic_umin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: Op = AtomicRMWInst::UMin; break; - case 
Intrinsic::amdgcn_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_raw_buffer_atomic_smax: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: Op = AtomicRMWInst::Max; break; - case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_umax: @@ -413,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *Permlanex16Call = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlanex16Call, AtomicTy)); @@ -425,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce across the upper and lower 32 lanes. V = B.CreateBitCast(V, IntNTy); Value *Permlane64Call = - B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V); + B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlane64Call, AtomicTy)); } @@ -433,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); V = B.CreateBitCast(V, IntNTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); @@ -481,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); Value *UpdateDPPCall = @@ -523,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Function *ReadLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); + Function *WriteLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_writelane, B.getInt32Ty()); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 231db188e65d..537d3a43aa9f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>; - +def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>; +def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>; @@ -290,7 +287,6 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; -def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index a0c6bf7cc31c..fb258547e8fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -46,8 +46,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { 
AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addRequired<MachineUniformityAnalysisPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -192,8 +192,8 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) @@ -209,8 +209,10 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( MachineFunction &MF) { - MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>(); - MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>(); + MachineDominatorTree &DT = + getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + MachinePostDominatorTree &PDT = + getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); MachineUniformityInfo &MUI = getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 7ab9ba285133..efe47b2c3eed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -464,16 +464,6 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); const 
Function &F = MF.getFunction(); - auto GetMCExprValue = [&MF](const MCExpr *Value) { - int64_t Val; - if (!Value->evaluateAsAbsolute(Val)) { - MCContext &Ctx = MF.getContext(); - Ctx.reportError(SMLoc(), "could not resolve expression when required."); - Val = 0; - } - return static_cast<uint64_t>(Val); - }; - auto Kern = HSAMetadataDoc->getMapNode(); Align MaxKernArgAlign; @@ -481,11 +471,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.LDSSize); - Kern[".private_segment_fixed_size"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize)); + DelayedExprs->assignDocNode(Kern[".private_segment_fixed_size"], + msgpack::Type::UInt, ProgramInfo.ScratchSize); if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) { - Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode( - static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack))); + DelayedExprs->assignDocNode(Kern[".uses_dynamic_stack"], + msgpack::Type::Boolean, + ProgramInfo.DynamicCallStack); } if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP()) @@ -497,15 +488,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); - Kern[".sgpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR)); - Kern[".vgpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR)); + DelayedExprs->assignDocNode(Kern[".sgpr_count"], msgpack::Type::UInt, + ProgramInfo.NumSGPR); + DelayedExprs->assignDocNode(Kern[".vgpr_count"], msgpack::Type::UInt, + ProgramInfo.NumVGPR); // Only add AGPR count to metadata for supported devices if (STM.hasMAIInsts()) { - Kern[".agpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR)); + 
DelayedExprs->assignDocNode(Kern[".agpr_count"], msgpack::Type::UInt, + ProgramInfo.NumAccVGPR); } Kern[".max_flat_workgroup_size"] = @@ -527,6 +518,7 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, } bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + DelayedExprs->resolveDelayedExpressions(); return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } @@ -536,9 +528,11 @@ void MetadataStreamerMsgPackV4::begin(const Module &Mod, emitTargetID(TargetID); emitPrintf(Mod); getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); + DelayedExprs->clear(); } void MetadataStreamerMsgPackV4::end() { + DelayedExprs->resolveDelayedExpressions(); std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); HSAMetadataDoc->toYAML(StrOS); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 0e3bc63919f0..fd76666dc360 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H +#include "Utils/AMDGPUDelayedMCExpr.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/Alignment.h" @@ -65,6 +66,9 @@ protected: class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4 : public MetadataStreamer { protected: + std::unique_ptr<DelayedMCExprs> DelayedExprs = + std::make_unique<DelayedMCExprs>(); + std::unique_ptr<msgpack::Document> HSAMetadataDoc = std::make_unique<msgpack::Document>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 57769fe998d1..86f28a505769 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -1482,9 +1482,7 @@ bool 
MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { MFMAChains = 0; for (auto &MFMAPipeSU : MFMAPipeSUs) { - if (MFMAChainSeeds.size() && - std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) != - MFMAChainSeeds.end()) + if (is_contained(MFMAChainSeeds, MFMAPipeSU)) continue; if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), [&TII](SDep &Succ) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b50c0cc12626..6d5ffc66d98b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { // isa<MemSDNode> almost works but is slightly too permissive for some DS // intrinsics. - if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || - Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) { + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; @@ -2006,12 +2004,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// For unbuffered smem loads, it is illegal for the Immediate Offset to be +// negative if the resulting (Offset + (M0 or SOffset or zero) is negative. +// Handle the case where the Immediate Offset + SOffset is negative. +bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, + bool Imm32Only, + bool IsBuffer, + int64_t ImmOffset) const { + if (!IsBuffer && !Imm32Only && ImmOffset < 0 && + AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) { + KnownBits SKnown = CurDAG->computeKnownBits(*SOffset); + if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0) + return false; + } + + return true; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); @@ -2019,15 +2036,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!C) { if (!SOffset) return false; + if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode; - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode.getOperand(0); - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } } return false; @@ -2038,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, // GFX9 and GFX10 have signed byte immediate offsets. The immediate // offset for S_BUFFER instructions is unsigned. int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue(); - std::optional<int64_t> EncodedOffset = - AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer); + std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset( + *Subtarget, ByteOffset, IsBuffer, HasSOffset); if (EncodedOffset && Offset && !Imm32Only) { *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; @@ -2098,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // true, match only 32-bit immediate offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, - bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) && - SelectSMRDBaseOffset(B, SBase, SOffset, nullptr); + + if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + return false; + + int64_t ImmOff = 0; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) + ImmOff = C->getSExtValue(); + + return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, + ImmOff); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2123,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, } if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) { + + if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) { + if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N1; return true; } @@ -2551,14 +2583,6 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) { CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } -void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) { - // TODO: Select this with a tablegen pattern. This is tricky because the - // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked - // mayLoad/mayStore and tablegen complains about the mismatch. 
- SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32); - CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg); -} - static unsigned gwsIntrinToOpcode(unsigned IntrID) { switch (IntrID) { case Intrinsic::amdgcn_ds_gws_init: @@ -2715,9 +2739,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { case Intrinsic::amdgcn_ds_bvh_stack_rtn: SelectDSBvhStackIntrinsic(N); return; - case Intrinsic::amdgcn_pops_exiting_wave_id: - SelectPOPSExitingWaveID(N); - return; } SelectCode(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8e5662a3cd81..e7911bc1793d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -24,10 +24,6 @@ using namespace llvm; namespace { -static inline bool isNullConstantOrUndef(SDValue V) { - return V.isUndef() || isNullConstant(V); -} - static inline bool getConstantValue(SDValue N, uint32_t &Out) { // This is only used for packed vectors, where using 0 for undef should // always be good. 
@@ -136,6 +132,8 @@ private: bool isFlatScratchBaseLegal(SDValue Addr) const; bool isFlatScratchBaseLegalSV(SDValue Addr) const; bool isFlatScratchBaseLegalSVImm(SDValue Addr) const; + bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only, + bool IsBuffer, int64_t ImmOffset = 0) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, @@ -178,11 +176,13 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -194,6 +194,8 @@ private: bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, SDValue &Offset) const; + bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, @@ -267,7 +269,6 @@ private: void SelectFP_EXTEND(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDSBvhStackIntrinsic(SDNode *N); - void SelectPOPSExitingWaveID(SDNode *N); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 375643b7f519..522b3a34161c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -42,8 +42,10 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { if (StoreSize <= 32) return EVT::getIntegerVT(Ctx, StoreSize); - assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); + if (StoreSize % 32 == 0) + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); + + return VT; } unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { @@ -5522,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(ATOMIC_LOAD_FMIN) - NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) @@ -5562,7 +5562,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 71c4334029b4..37572af3897f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -575,8 +575,6 @@ enum NodeType : unsigned { TBUFFER_LOAD_FORMAT_D16, DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, - ATOMIC_LOAD_FMIN, - ATOMIC_LOAD_FMAX, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, @@ -615,7 +613,6 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FADD_BF16, BUFFER_ATOMIC_FMIN, 
BUFFER_ATOMIC_FMAX, BUFFER_ATOMIC_COND_SUB_U32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp index b78952ca3a62..43b3bf43fe56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUGenSearchableTables.inc" #include "GCNSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" @@ -214,12 +215,14 @@ public: RegisterUseCount[Unit]++; // Do not attempt to optimise across exec mask changes. - if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { + if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || + AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { for (auto &UsedReg : RegisterUseCount) UsedReg.second = 2; } - if (!SIInstrInfo::isVALU(MI)) + if (!SIInstrInfo::isVALU(MI) || + AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) continue; if (AllProducerOperandsAreSingleUse) { SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 160a17584ca3..93bca4402ed2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); break; } - case Intrinsic::amdgcn_buffer_store_format: case Intrinsic::amdgcn_raw_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: case Intrinsic::amdgcn_raw_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: - case Intrinsic::amdgcn_tbuffer_store: case Intrinsic::amdgcn_image_store_1d: case Intrinsic::amdgcn_image_store_1darray: case Intrinsic::amdgcn_image_store_2d: @@ 
-1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( std::function<void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const { switch (II.getIntrinsicID()) { - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: @@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( case Intrinsic::amdgcn_struct_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: case Intrinsic::amdgcn_struct_ptr_tbuffer_load: - case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ae3f2b87f353..a3cb3b3f47e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2079,21 +2079,6 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } -bool AMDGPUInstructionSelector::selectPOPSExitingWaveID( - MachineInstr &MI) const { - Register Dst = MI.getOperand(0).getReg(); - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock *MBB = MI.getParent(); - - // TODO: Select this with a tablegen pattern. This is tricky because the - // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked - // mayLoad/mayStore and tablegen complains about the mismatch. 
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) - .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID); - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); @@ -2144,8 +2129,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectSBarrierSignalIsfirst(I, IntrinsicID); case Intrinsic::amdgcn_s_barrier_leave: return selectSBarrierLeave(I); - case Intrinsic::amdgcn_pops_exiting_wave_id: - return selectPOPSExitingWaveID(I); } return selectImpl(I, *CoverageInfo); } @@ -3620,8 +3603,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: case TargetOpcode::G_ATOMICRMW_FADD: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: + case TargetOpcode::G_ATOMICRMW_FMIN: + case TargetOpcode::G_ATOMICRMW_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); @@ -4216,10 +4199,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; const GEPInfo &GEPI = AddrInfo[0]; - std::optional<int64_t> EncodedImm = - AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false); + std::optional<int64_t> EncodedImm; if (SOffset && Offset) { + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/true); if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; @@ -4229,6 +4213,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Base = GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; + if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI)) + return true; + + // For unbuffered smem loads, it is illegal for the Immediate Offset + // to be 
negative if the resulting (Offset + (M0 or SOffset or zero) + // is negative. Handle the case where the Immediate Offset + SOffset + // is negative. + auto SKnown = KB->getKnownBits(*SOffset); + if (*Offset + SKnown.getMinValue().getSExtValue() < 0) + return false; + return true; } } @@ -4236,6 +4231,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; } + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/false); if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { Base = GEPI.SgprParts[0]; *Offset = *EncodedImm; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 48f3b1811801..f561d5d29efc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -125,7 +125,6 @@ private: bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; bool selectSBarrier(MachineInstr &MI) const; bool selectDSBvhStackIntrinsic(MachineInstr &MI) const; - bool selectPOPSExitingWaveID(MachineInstr &MI) const; bool selectImageIntrinsic(MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index fa7492ac6cbe..c6dbc58395e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -140,7 +140,9 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0, let PrintMethod = printer; } -def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; +class S16ImmOperand : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; + +def s16imm : S16ImmOperand; def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">; class ValuePredicatedOperand<CustomOperand op, string valuePredicate, @@ -616,6 +618,7 @@ multiclass local_addr_space_atomic_op { } } +defm int_amdgcn_flat_atomic_fadd : 
noret_op; defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_flat_atomic_fmin : noret_op; @@ -627,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; -defm int_amdgcn_ds_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; @@ -637,9 +639,14 @@ defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; -multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { +multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in - defm "_noret" : binary_atomic_op<atomic_op, IsInt>; + defm "_noret" : binary_atomic_op<atomic_op>; +} + +multiclass noret_binary_atomic_op_fp<SDNode atomic_op> { + let HasNoUse = true in + defm "_noret" : binary_atomic_op_fp<atomic_op>; } multiclass noret_ternary_atomic_op<SDNode atomic_op> { @@ -647,11 +654,21 @@ multiclass noret_ternary_atomic_op<SDNode atomic_op> { defm "_noret" : ternary_atomic_op<atomic_op>; } -multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> { - foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +defvar atomic_addrspace_names = [ "global", "flat", "constant", "local", "private", "region" ]; + +multiclass binary_atomic_op_all_as<SDNode atomic_op> { + foreach as = atomic_addrspace_names in { + let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + defm "_"#as : binary_atomic_op<atomic_op>; + defm "_"#as : noret_binary_atomic_op<atomic_op>; + } + } +} +multiclass binary_atomic_op_fp_all_as<SDNode atomic_op> { + foreach as = 
atomic_addrspace_names in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { - defm "_"#as : binary_atomic_op<atomic_op, IsInt>; - defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>; + defm "_"#as : binary_atomic_op_fp<atomic_op>; + defm "_"#as : noret_binary_atomic_op_fp<atomic_op>; } } } @@ -666,11 +683,11 @@ defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>; defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>; defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>; defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>; -defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>; +defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>; +defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>; +defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>; defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; -let MemoryVT = v2f16 in -defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ee7fb20c23aa..f1254b2e9e1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1); static const LLT S8 = LLT::scalar(8); static const LLT S16 = LLT::scalar(16); static const LLT S32 = LLT::scalar(32); +static const LLT F32 = LLT::float32(); static const LLT S64 = LLT::scalar(64); +static const LLT F64 = LLT::float64(); static const LLT S96 = LLT::scalar(96); static const LLT S128 = LLT::scalar(128); static const LLT S160 = LLT::scalar(160); @@ -301,6 +303,9 @@ 
static const LLT V10S16 = LLT::fixed_vector(10, 16); static const LLT V12S16 = LLT::fixed_vector(12, 16); static const LLT V16S16 = LLT::fixed_vector(16, 16); +static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16()); +static const LLT V2BF16 = V2F16; // FIXME + static const LLT V2S32 = LLT::fixed_vector(2, 32); static const LLT V3S32 = LLT::fixed_vector(3, 32); static const LLT V4S32 = LLT::fixed_vector(4, 32); @@ -1638,13 +1643,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasLdsAtomicAddF64()) Atomic.legalFor({{S64, LocalPtr}}); if (ST.hasAtomicDsPkAdd16Insts()) - Atomic.legalFor({{V2S16, LocalPtr}}); + Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); + getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) + .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); + if (ST.hasGFX90AInsts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations @@ -1656,6 +1664,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }); } + if (ST.hasAtomicBufferGlobalPkAddF16Insts()) + Atomic.legalFor({{V2F16, GlobalPtr}}); + if (ST.hasAtomicGlobalPkAddBF16Inst()) + Atomic.legalFor({{V2BF16, GlobalPtr}}); + if (ST.hasAtomicFlatPkAdd16Insts()) + Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) @@ -5388,12 +5403,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { switch (IID) { - case Intrinsic::amdgcn_ds_fadd: - return AMDGPU::G_ATOMICRMW_FADD; case Intrinsic::amdgcn_ds_fmin: - return AMDGPU::G_AMDGPU_ATOMIC_FMIN; + return AMDGPU::G_ATOMICRMW_FMIN; case Intrinsic::amdgcn_ds_fmax: - return 
AMDGPU::G_AMDGPU_ATOMIC_FMAX; + return AMDGPU::G_ATOMICRMW_FMAX; default: llvm_unreachable("not a DS FP intrinsic"); } @@ -5417,6 +5430,126 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, return true; } +// TODO: Fix pointer type handling +bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, + MachineInstr &MI, + Intrinsic::ID IID) const { + + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; + + auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, + Register Src2, LLT VT) -> Register { + auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); + switch (IID) { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: + return LaneOp.getReg(0); + case Intrinsic::amdgcn_readlane: + return LaneOp.addUse(Src1).getReg(0); + case Intrinsic::amdgcn_writelane: + return LaneOp.addUse(Src1).addUse(Src2).getReg(0); + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + Register Src3 = MI.getOperand(5).getReg(); + int64_t Src4 = MI.getOperand(6).getImm(); // Immediate operand, not a register. + int64_t Src5 = MI.getOperand(7).getImm(); // Immediate operand, not a register. + return LaneOp.addUse(Src1) + .addUse(Src2) + .addUse(Src3) + .addImm(Src4) + .addImm(Src5) + .getReg(0); + } + default: + llvm_unreachable("unhandled lane op"); + } + }; + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { + Src1 = MI.getOperand(3).getReg(); + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + Src2 = MI.getOperand(4).getReg(); + } + } + + LLT Ty = MRI.getType(DstReg); + unsigned Size = Ty.getSizeInBits(); + + if (Size == 32) { + // Already legal + return true; + } + + if (Size < 32) { + Src0 = B.buildAnyExt(S32, Src0).getReg(0); + + if (IsPermLane16) + Src1 =
B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + + Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); + B.buildTrunc(DstReg, LaneOpDst); + MI.eraseFromParent(); + return true; + } + + if (Size % 32 != 0) + return false; + + LLT PartialResTy = S32; + if (Ty.isVector()) { + LLT EltTy = Ty.getElementType(); + switch (EltTy.getSizeInBits()) { + case 16: + PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2)); + break; + case 32: + PartialResTy = EltTy; + break; + default: + // Handle all other cases via S32 pieces; + break; + } + } + + SmallVector<Register, 2> PartialRes; + unsigned NumParts = Size / 32; + MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); + MachineInstrBuilder Src1Parts, Src2Parts; + + if (IsPermLane16) + Src1Parts = B.buildUnmerge(PartialResTy, Src1); + + if (IID == Intrinsic::amdgcn_writelane) + Src2Parts = B.buildUnmerge(PartialResTy, Src2); + + for (unsigned i = 0; i < NumParts; ++i) { + Src0 = Src0Parts.getReg(i); + + if (IsPermLane16) + Src1 = Src1Parts.getReg(i); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = Src2Parts.getReg(i); + + PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); + } + + B.buildMergeLikeInstr(DstReg, PartialRes); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -6008,9 +6141,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case 
Intrinsic::amdgcn_struct_buffer_atomic_fmin: @@ -6630,9 +6760,9 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, MI.removeOperand(1); // Remove intrinsic ID // FIXME: When intrinsic definition is fixed, this should have an MMO already. - // TODO: Should this use datalayout alignment? const unsigned MemSize = (Size + 7) / 8; - const Align MemAlign(std::min(MemSize, 4u)); + const Align MemAlign = B.getDataLayout().getABITypeAlign( + getTypeForLLT(Ty, MF.getFunction().getContext())); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | @@ -7318,14 +7448,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_rsq_clamp: return legalizeRsqClampIntrinsic(MI, MRI, B); - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); @@ -7365,6 +7490,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, Observer.changedInstr(MI); return true; } + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: + return legalizeLaneOp(Helper, MI, IntrID); default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 4b1d821dadc2..ae01bb29c110 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -210,6 +210,9 @@ public: bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const; + bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, + Intrinsic::ID IID) const; + bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index c515138d95a2..456f3cb332cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1129,15 +1129,11 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, nval = CreateCallEx(B,ExpExpr, nval, "__exp2"); if (needcopysign) { - Value *opr_n; - Type* rTy = opr0->getType(); Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); - Type *nTy = nTyS; - if (const auto *vTy = dyn_cast<FixedVectorType>(rTy)) - nTy = FixedVectorType::get(nTyS, vTy); + Type *nTy = FPOp->getType()->getWithNewType(nTyS); unsigned size = nTy->getScalarSizeInBits(); - opr_n = FPOp->getOperand(1); - if (opr_n->getType()->isIntegerTy()) + Value *opr_n = FPOp->getOperand(1); + if (opr_n->getType()->getScalarType()->isIntegerTy()) opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index f878bd9465d3..a8f6ad09fe28 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -200,6 +200,7 @@ #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Utils/Local.h" 
#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/Constants.h" @@ -214,6 +215,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" @@ -578,18 +580,14 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { /// buffer fat pointer constant. static std::pair<Constant *, Constant *> splitLoweredFatBufferConst(Constant *C) { - if (auto *AZ = dyn_cast<ConstantAggregateZero>(C)) - return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1)); - if (auto *SC = dyn_cast<ConstantStruct>(C)) - return std::make_pair(SC->getOperand(0), SC->getOperand(1)); - llvm_unreachable("Conversion should've created a {p8, i32} struct"); + assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer"); + return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u)); } namespace { /// Handle the remapping of ptr addrspace(7) constants. class FatPtrConstMaterializer final : public ValueMaterializer { BufferFatPtrToStructTypeMap *TypeMap; - BufferFatPtrToIntTypeMap *IntTypeMap; // An internal mapper that is used to recurse into the arguments of constants. // While the documentation for `ValueMapper` specifies not to use it // recursively, examination of the logic in mapValue() shows that it can @@ -599,16 +597,12 @@ class FatPtrConstMaterializer final : public ValueMaterializer { Constant *materializeBufferFatPtrConst(Constant *C); - const DataLayout &DL; - public: // UnderlyingMap is the value map this materializer will be filling. 
FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, - ValueToValueMapTy &UnderlyingMap, - BufferFatPtrToIntTypeMap *IntTypeMap, - const DataLayout &DL) - : TypeMap(TypeMap), IntTypeMap(IntTypeMap), - InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {} + ValueToValueMapTy &UnderlyingMap) + : TypeMap(TypeMap), + InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {} virtual ~FatPtrConstMaterializer() = default; Value *materialize(Value *V) override; @@ -631,10 +625,6 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { UndefValue::get(NewTy->getElementType(1))}); } - if (isa<GlobalValue>(C)) - report_fatal_error("Global values containing ptr addrspace(7) (buffer " - "fat pointer) values are not supported"); - if (auto *VC = dyn_cast<ConstantVector>(C)) { if (Constant *S = VC->getSplatValue()) { Constant *NewS = InternalMapper.mapConstant(*S); @@ -660,127 +650,14 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); } - // Constant expressions. This code mirrors how we fix up the equivalent - // instructions later. - auto *CE = dyn_cast<ConstantExpr>(C); - if (!CE) - return nullptr; - if (auto *GEPO = dyn_cast<GEPOperator>(C)) { - Constant *RemappedPtr = - InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand())); - auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr); - Type *OffTy = Off->getType(); - bool InBounds = GEPO->isInBounds(); - - MapVector<Value *, APInt> VariableOffs; - APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth); - if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs, - NewConstOffVal)) - report_fatal_error( - "Scalable vector or unsized struct in fat pointer GEP"); - Constant *OffAccum = nullptr; - // Accumulate offsets together before adding to the base in order to - // preserve as many of the inbounds properties as possible. 
- for (auto [Arg, Multiple] : VariableOffs) { - Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg)); - NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL); - if (!Multiple.isOne()) { - if (Multiple.isPowerOf2()) { - NewArg = ConstantExpr::getShl( - NewArg, - CE->getIntegerValue( - OffTy, APInt(BufferOffsetWidth, Multiple.logBase2())), - /*hasNUW=*/InBounds, /*HasNSW=*/InBounds); - } else { - NewArg = - ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple), - /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - } - } - if (OffAccum) { - OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - } else { - OffAccum = NewArg; - } - } - Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal); - if (OffAccum) - OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff, - /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - else - OffAccum = NewConstOff; - bool HasNonNegativeOff = false; - if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) { - HasNonNegativeOff = !CI->isNegative(); - } - Constant *NewOff = ConstantExpr::getAdd( - Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff, - /*hasNSW=*/false); - return ConstantStruct::get(NewTy, {Rsrc, NewOff}); - } - - if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) { - Constant *Parts = - InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand())); - auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts); - // Here, we take advantage of the fact that ptrtoint has a built-in - // zero-extension behavior. 
- unsigned FatPtrWidth = - DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy); - unsigned Width = SrcTy->getScalarSizeInBits(); - Constant *Shift = - CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth)); - Constant *OffCast = - ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL); - Constant *RsrcHi = ConstantExpr::getShl( - RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth); - // This should be an or, but those got recently removed. - Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true); - return Result; - } + if (isa<GlobalValue>(C)) + report_fatal_error("Global values containing ptr addrspace(7) (buffer " + "fat pointer) values are not supported"); - if (CE->getOpcode() == Instruction::IntToPtr) { - auto *Arg = cast<Constant>(CE->getOperand(0)); - unsigned FatPtrWidth = - DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); - auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth); - Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL); - - Constant *Shift = - CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth)); - Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth); - Type *RsrcTy = NewTy->getElementType(0); - Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth); - Constant *RsrcInt = CE->getTrunc( - ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL), - RsrcIntType); - Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy); - Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*isSigned=*/false, DL); - - return ConstantStruct::get(NewTy, {Rsrc, Off}); - } + if (isa<ConstantExpr>(C)) + report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer " + "fat pointer) values should have been expanded earlier"); - if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) { - unsigned SrcAS = AC->getSrcAddressSpace(); - 
unsigned DstAS = AC->getDestAddressSpace(); - auto *Arg = cast<Constant>(AC->getPointerOperand()); - auto *NewArg = InternalMapper.mapConstant(*Arg); - if (!NewArg) - return nullptr; - if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER && - DstAS == AMDGPUAS::BUFFER_FAT_POINTER) - return NewArg; - if (SrcAS == AMDGPUAS::BUFFER_RESOURCE && - DstAS == AMDGPUAS::BUFFER_FAT_POINTER) { - auto *NullOff = CE->getNullValue(NewTy->getElementType(1)); - return ConstantStruct::get(NewTy, {NewArg, NullOff}); - } - report_fatal_error( - "Unsupported address space cast for a buffer fat pointer"); - } return nullptr; } @@ -788,26 +665,6 @@ Value *FatPtrConstMaterializer::materialize(Value *V) { Constant *C = dyn_cast<Constant>(V); if (!C) return nullptr; - if (auto *GEPO = dyn_cast<GEPOperator>(C)) { - // As a special case, adjust GEP constants that have a ptr addrspace(7) in - // their source types here, since the earlier local changes didn't handle - // htis. - Type *SrcTy = GEPO->getSourceElementType(); - Type *NewSrcTy = IntTypeMap->remapType(SrcTy); - if (SrcTy != NewSrcTy) { - SmallVector<Constant *> Ops; - Ops.reserve(GEPO->getNumOperands()); - for (const Use &U : GEPO->operands()) - Ops.push_back(cast<Constant>(U.get())); - auto *NewGEP = ConstantExpr::getGetElementPtr( - NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1), - GEPO->getNoWrapFlags(), GEPO->getInRange()); - LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP - << "\n"); - Value *FurtherMap = materialize(NewGEP); - return FurtherMap ? FurtherMap : NewGEP; - } - } // Structs and other types that happen to contain fat pointers get remapped // by the mapValue() logic. 
if (!isBufferFatPtrConst(C)) @@ -1387,57 +1244,25 @@ PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { } PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { + using namespace llvm::PatternMatch; Value *Ptr = GEP.getPointerOperand(); if (!isSplitFatPtr(Ptr->getType())) return {nullptr, nullptr}; IRB.SetInsertPoint(&GEP); auto [Rsrc, Off] = getPtrParts(Ptr); - Type *OffTy = Off->getType(); const DataLayout &DL = GEP.getModule()->getDataLayout(); bool InBounds = GEP.isInBounds(); - // In order to call collectOffset() and thus not have to reimplement it, - // we need the GEP's pointer operand to have ptr addrspace(7) type - GEP.setOperand(GEP.getPointerOperandIndex(), - PoisonValue::get(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER))); - MapVector<Value *, APInt> VariableOffs; - APInt ConstOffVal = APInt::getZero(BufferOffsetWidth); - if (!GEP.collectOffset(DL, BufferOffsetWidth, VariableOffs, ConstOffVal)) - report_fatal_error("Scalable vector or unsized struct in fat pointer GEP"); - GEP.setOperand(GEP.getPointerOperandIndex(), Ptr); - Value *OffAccum = nullptr; - // Accumulate offsets together before adding to the base in order to preserve - // as many of the inbounds properties as possible. 
- for (auto [Arg, Multiple] : VariableOffs) { - if (auto *OffVecTy = dyn_cast<VectorType>(OffTy)) - if (!Arg->getType()->isVectorTy()) - Arg = IRB.CreateVectorSplat(OffVecTy->getElementCount(), Arg); - Arg = IRB.CreateIntCast(Arg, OffTy, /*isSigned=*/true); - if (!Multiple.isOne()) { - if (Multiple.isPowerOf2()) - Arg = IRB.CreateShl(Arg, Multiple.logBase2(), "", /*hasNUW=*/InBounds, - /*HasNSW=*/InBounds); - else - Arg = IRB.CreateMul(Arg, ConstantExpr::getIntegerValue(OffTy, Multiple), - "", /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - } - if (OffAccum) - OffAccum = IRB.CreateAdd(OffAccum, Arg, "", /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - else - OffAccum = Arg; - } - if (!ConstOffVal.isZero()) { - Constant *ConstOff = ConstantExpr::getIntegerValue(OffTy, ConstOffVal); - if (OffAccum) - OffAccum = IRB.CreateAdd(OffAccum, ConstOff, "", /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - else - OffAccum = ConstOff; - } - - if (!OffAccum) { // Constant-zero offset + // In order to call emitGEPOffset() and thus not have to reimplement it, + // we need the GEP result to have ptr addrspace(7) type. 
+ Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER); + if (auto *VT = dyn_cast<VectorType>(Off->getType())) + FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount()); + GEP.mutateType(FatPtrTy); + Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP); + GEP.mutateType(Ptr->getType()); + if (match(OffAccum, m_Zero())) { // Constant-zero offset SplitUsers.insert(&GEP); return {Rsrc, Off}; } @@ -1447,7 +1272,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { HasNonNegativeOff = !CI->isNegative(); } Value *NewOff; - if (PatternMatch::match(Off, PatternMatch::is_zero())) { + if (match(Off, m_Zero())) { NewOff = OffAccum; } else { NewOff = IRB.CreateAdd(Off, OffAccum, "", @@ -1473,20 +1298,22 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { const DataLayout &DL = PI.getModule()->getDataLayout(); unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - Value *RsrcInt; - if (Width <= BufferOffsetWidth) - RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width)); - else - RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); - copyMetadata(RsrcInt, &PI); - - Value *Shl = IRB.CreateShl( - RsrcInt, - ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "", - Width >= FatPtrWidth, Width > FatPtrWidth); - Value *OffCast = - IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off"); - Value *Res = IRB.CreateOr(Shl, OffCast); + Value *Res; + if (Width <= BufferOffsetWidth) { + Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, + PI.getName() + ".off"); + } else { + Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); + Value *Shl = IRB.CreateShl( + RsrcInt, + ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), + "", Width >= FatPtrWidth, Width > FatPtrWidth); + Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, + PI.getName() + ".off"); + Res = IRB.CreateOr(Shl, OffCast); + } + + 
copyMetadata(Res, &PI); Res->takeName(&PI); SplitUsers.insert(&PI); PI.replaceAllUsesWith(Res); @@ -1818,14 +1645,9 @@ public: static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); - for (const Use &U : I.operands()) - if (auto *C = dyn_cast<Constant>(U.get())) - HasFatPointers |= isBufferFatPtrConst(C); - } - } return HasFatPointers; } @@ -1924,6 +1746,36 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { "buffer resource pointers (address space 8) instead."); } + { + // Collect all constant exprs and aggregates referenced by any function. + SmallVector<Constant *, 8> Worklist; + for (Function &F : M.functions()) + for (Instruction &I : instructions(F)) + for (Value *Op : I.operands()) + if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) + Worklist.push_back(cast<Constant>(Op)); + + // Recursively look for any referenced buffer pointer constants. + SmallPtrSet<Constant *, 8> Visited; + SetVector<Constant *> BufferFatPtrConsts; + while (!Worklist.empty()) { + Constant *C = Worklist.pop_back_val(); + if (!Visited.insert(C).second) + continue; + if (isBufferFatPtrOrVector(C->getType())) + BufferFatPtrConsts.insert(C); + for (Value *Op : C->operands()) + if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) + Worklist.push_back(cast<Constant>(Op)); + } + + // Expand all constant expressions using fat buffer pointers to + // instructions. 
+ Changed |= convertUsersOfConstantsToInstructions( + BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr, + /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true); + } + StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext()); for (Function &F : M.functions()) { bool InterfaceChange = hasFatPointerInterface(F, &StructTM); @@ -1939,7 +1791,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { SmallVector<Function *> Intrinsics; // Keep one big map so as to memoize constants across functions. ValueToValueMapTy CloneMap; - FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL); + FatPtrConstMaterializer Materializer(&StructTM, CloneMap); ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer); for (auto [F, InterfaceChange] : NeedsRemap) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index 6ec4178053b2..11f0cba47afd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -17,6 +17,157 @@ using namespace llvm; +void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned int> OpIdx, int64_t Imm) const { + + switch (MI.getOpcode()) { + case AMDGPU::S_DELAY_ALU: + assert(OpIdx == 0); + printSDelayAluImm(Imm, OS); + break; + default: + MIRFormatter::printImm(OS, MI, OpIdx, Imm); + break; + } +} + +/// Implement target specific parsing of immediate mnemonics. The mnemonic is +/// a string with a leading dot. 
+bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const +{ + + switch (OpCode) { + case AMDGPU::S_DELAY_ALU: + return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); + default: + break; + } + return true; // Don't know what this is +} + +void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm, + llvm::raw_ostream &OS) const { + // Construct an immediate string to represent the information encoded in the + // s_delay_alu immediate. + // .id0_<dep>[_skip_<count>_id1_<dep>] + constexpr int64_t None = 0; + constexpr int64_t Same = 0; + + uint64_t Id0 = (Imm & 0xF); + uint64_t Skip = ((Imm >> 4) & 0x7); + uint64_t Id1 = ((Imm >> 7) & 0xF); + auto Outdep = [&](uint64_t Id) { + if (Id == None) + OS << "NONE"; + else if (Id < 5) + OS << "VALU_DEP_" << Id; + else if (Id < 8) + OS << "TRANS32_DEP_" << Id - 4; + else + OS << "SALU_CYCLE_" << Id - 8; + }; + + OS << ".id0_"; + Outdep(Id0); + + // If the second inst is "same" and "none", no need to print the rest of the + // string. + if (Skip == Same && Id1 == None) + return; + + // Encode the second delay specification.
+ OS << "_skip_"; + if (Skip == 0) + OS << "SAME"; + else if (Skip == 1) + OS << "NEXT"; + else + OS << "SKIP_" << Skip - 1; + + OS << "_id1_"; + Outdep(Id1); +} + +bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, + llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const +{ + assert(OpIdx == 0); + + Imm = 0; + bool Expected = Src.consume_front(".id0_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected .id0_"); + + auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t { + int64_t Dep; + if (!Src.consumeInteger(10, Dep)) + return Dep + Offset; + + return -1; + }; + + auto DecodeDelay = [&](StringRef &Src) -> int64_t { + if (Src.consume_front("NONE")) + return 0; + if (Src.consume_front("VALU_DEP_")) + return ExpectInt(Src, 0); + if (Src.consume_front("TRANS32_DEP_")) + return ExpectInt(Src, 4); + if (Src.consume_front("SALU_CYCLE_")) + return ExpectInt(Src, 8); + + return -1; + }; + + int64_t Delay0 = DecodeDelay(Src); + int64_t Skip = 0; + int64_t Delay1 = 0; + if (Delay0 == -1) + return ErrorCallback(Src.begin(), "Could not decode delay0"); + + + // Set the Imm so far, so that an early return has the correct value.
+ Imm = Delay0; + + // If that was the end of the string, the second instruction is "same" and + // "none" + if (Src.begin() == Src.end()) + return false; + + Expected = Src.consume_front("_skip_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected _skip_"); + + + if (Src.consume_front("SAME")) { + Skip = 0; + } else if (Src.consume_front("NEXT")) { + Skip = 1; + } else if (Src.consume_front("SKIP_")) { + if (Src.consumeInteger(10, Skip)) { + return ErrorCallback(Src.begin(), "Expected integer Skip value"); + } + Skip += 1; + } else { + return ErrorCallback(Src.begin(), "Unexpected Skip Value"); // Stop here, like every other error path. + } + + Expected = Src.consume_front("_id1_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected _id1_"); + + Delay1 = DecodeDelay(Src); + if (Delay1 == -1) + return ErrorCallback(Src.begin(), "Could not decode delay1"); + + Imm = Imm | (Skip << 4) | (Delay1 << 7); + return false; +} + bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue( StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 98b5031071cf..c5c947375252 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -28,12 +28,35 @@ public: AMDGPUMIRFormatter() = default; virtual ~AMDGPUMIRFormatter() = default; + /// Implement target specific printing for machine operand immediate value, so + /// that we can have more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. + virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, + int64_t Imm) const override; + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is + /// a string with a leading dot.
+ virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; + /// Implement target specific parsing of target custom pseudo source value. bool parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const override; + +private: + /// Print the string to represent s_delay_alu immediate value + void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const; + + /// Parse the immediate pseudo literal for s_delay_alu + bool parseSDelayAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, + llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const; + }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index f36374b08b34..cfe9f33efc91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -100,7 +100,7 @@ public: bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const; // Combine unsigned buffer load and signed extension instructions to generate - // signed buffer laod instructions. + // signed buffer load instructions. 
bool matchCombineSignExtendInReg( MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const; void applyCombineSignExtendInReg( @@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 3f01a328afaf..4d0cb467ba37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } AU.addRequired<GISelCSEAnalysisWrapperPass>(); @@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); MachineDominatorTree *MDT = - IsOptNone ? 
nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 35abd6eddde8..74f0540239c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { const auto *LI = ST.getLegalizerInfo(); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? 
nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 2ea03ddb1fcc..d1985f46b1c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const { void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineCycleInfoWrapperPass>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); // TODO: Preserve DomTree RegBankSelect::getAnalysisUsage(AU); } @@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) @@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MachineCycleInfo &CycleInfo = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo(); - MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>(); + MachineDominatorTree &DomTree = + getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MachineUniformityInfo Uniformity = computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7ebd674757fb..9e7694f41d6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3079,7 +3079,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); @@ -4376,7 +4375,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out @@ -4907,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_load_tr_b128: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd_v2bf16: { + case Intrinsic::amdgcn_ds_ordered_swap: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, @@ -5221,11 +5218,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: case AMDGPU::G_ATOMICRMW_FADD: + case AMDGPU::G_ATOMICRMW_FMIN: + case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { + case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 410dc83d45c5..ed5bae3e4ff6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -252,21 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; -def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; -def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>; @@ -280,7 +267,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; @@ -298,7 +284,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; @@ -316,7 +301,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; @@ -334,12 +318,10 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_live_mask>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 2449fa581842..3e5d83b8e3fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -15,10 +15,9 @@ /// SplitModule: load-balance the module's functions across a set of N /// partitions to allow parallel codegen. However, it does it very /// differently than the target-agnostic variant: -/// - Kernels are used as the module's "roots". -/// They're known entry points on AMDGPU, and everything else is often -/// internal only. -/// - Each kernel has a set of dependencies, and when a kernel and its +/// - The module has "split roots", which are kernels in the vast +/// majority of cases. +/// - Each root has a set of dependencies, and when a root and its /// dependencies is considered "big", we try to put it in a partition where /// most dependencies are already imported, to avoid duplicating large /// amounts of code. @@ -67,20 +66,22 @@ using namespace llvm; namespace { -static cl::opt<float> LargeKernelFactor( - "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f), +static cl::opt<float> LargeFnFactor( + "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f), cl::Hidden, cl::desc( - "consider a kernel as large and needing special treatment when it " + "consider a function as large and needing special treatment when the " + "cost of importing it into a partition" "exceeds the average cost of a partition by this factor; e;g.
2.0 " - "means if the kernel and its dependencies is 2 times bigger than " - "an average partition; 0 disables large kernels handling entirely")); + "means if the function and its dependencies is 2 times bigger than " + "an average partition; 0 disables large functions handling entirely")); -static cl::opt<float> LargeKernelOverlapForMerge( - "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f), +static cl::opt<float> LargeFnOverlapForMerge( + "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f), cl::Hidden, - cl::desc("defines how much overlap between two large kernel's dependencies " - "is needed to put them in the same partition")); + cl::desc( + "defines how much overlap between two large function's dependencies " + "is needed to put them in the same partition")); static cl::opt<bool> NoExternalizeGlobals( "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, @@ -98,6 +99,7 @@ static cl::opt<bool> using CostType = InstructionCost::CostType; using PartitionID = unsigned; +using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>; static bool isEntryPoint(const Function *F) { return AMDGPU::isEntryFunctionCC(F->getCallingConv()); @@ -214,13 +216,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { /// Calculate the cost of each function in \p M /// \param SML Log Helper -/// \param TM TargetMachine instance used to retrieve TargetTransformInfo. +/// \param GetTTI Abstract getter for TargetTransformInfo. /// \param M Module to analyze. /// \param CostMap[out] Resulting Function -> Cost map. /// \return The module's total cost. 
static CostType -calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, - Module &M, +calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M, DenseMap<const Function *, CostType> &CostMap) { CostType ModuleCost = 0; CostType KernelCost = 0; @@ -230,8 +231,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, continue; CostType FnCost = 0; - TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn); - + const auto &TTI = GetTTI(Fn); for (const auto &BB : Fn) { for (const auto &I : BB) { auto Cost = @@ -277,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) { /*IgnoreCastedDirectCall=*/true); } -/// When a kernel or any of its callees performs an indirect call, this function +/// When a function or any of its callees performs an indirect call, this /// takes over \ref addAllDependencies and adds all potentially callable -/// functions to \p Fns so they can be counted as dependencies of the kernel. +/// functions to \p Fns so they can be counted as dependencies of the function. /// /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the /// presence of an indirect call, the function's resource usage is the same as @@ -301,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M, /// \param CG Call graph for \p Fn's module. /// \param Fn Current function to look at. /// \param Fns[out] Resulting list of functions. +/// \param OnlyDirect Whether to only consider direct callees. /// \param HadIndirectCall[out] Set to true if an indirect call was seen at some /// point, either in \p Fn or in one of the function it calls. When that /// happens, we fall back to adding all callable functions inside \p Fn's module /// to \p Fns. 
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, const Function &Fn, - DenseSet<const Function *> &Fns, + DenseSet<const Function *> &Fns, bool OnlyDirect, bool &HadIndirectCall) { assert(!Fn.isDeclaration()); @@ -325,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, auto *CGNode = CGEntry.second; auto *Callee = CGNode->getFunction(); if (!Callee) { + if (OnlyDirect) + continue; + // Functions have an edge towards CallsExternalNode if they're external // declarations, or if they do an indirect call. As we only process // definitions here, we know this means the function has an indirect @@ -353,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, } } -/// Contains information about a kernel and its dependencies. -struct KernelWithDependencies { - KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG, - const DenseMap<const Function *, CostType> &FnCosts, - const Function *Fn) +/// Contains information about a function and its dependencies. +/// This is a splitting root. The splitting algorithm works by +/// assigning these to partitions. +struct FunctionWithDependencies { + FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG, + const DenseMap<const Function *, CostType> &FnCosts, + const Function *Fn) : Fn(Fn) { - addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall); + // When Fn is not a kernel, we don't need to collect indirect callees. + // Resource usage analysis is only performed on kernels, and we collect + // indirect callees for resource usage analysis. 
+ addAllDependencies(SML, CG, *Fn, Dependencies, + /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall); TotalCost = FnCosts.at(Fn); for (const auto *Dep : Dependencies) { TotalCost += FnCosts.at(Dep); @@ -380,8 +390,8 @@ struct KernelWithDependencies { CostType TotalCost = 0; - /// \returns true if this kernel and its dependencies can be considered large - /// according to \p Threshold. + /// \returns true if this function and its dependencies can be considered + /// large according to \p Threshold. bool isLarge(CostType Threshold) const { return TotalCost > Threshold && !Dependencies.empty(); } @@ -420,39 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A, /// \param NumParts Number of partitions to create. /// \param ModuleCost Total cost of all functions in \p M. /// \param FnCosts Map of Function -> Cost -/// \param WorkList Kernels and their dependencies to process in order. +/// \param WorkList Functions and their dependencies to process in order. /// \returns The created partitions (a vector of size \p NumParts ) static std::vector<DenseSet<const Function *>> doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, CostType ModuleCost, const DenseMap<const Function *, CostType> &FnCosts, - const SmallVector<KernelWithDependencies> &WorkList) { + const SmallVector<FunctionWithDependencies> &WorkList) { SML << "\n--Partitioning Starts--\n"; - // Calculate a "large kernel threshold". When more than one kernel's total - // import cost exceeds this value, we will try to merge it with other, - // similarly large kernels. + // Calculate a "large function threshold". When more than one function's total + // import cost exceeds this value, we will try to assign it to an existing + // partition to reduce the amount of duplication needed. // - // e.g. let two kernels X and Y have a import cost of ~10% of the module, we + // e.g. 
let two functions X and Y have an import cost of ~10% of the module, we // assign X to a partition as usual, but when we get to Y, we check if it's // worth also putting it in X's partition. - const CostType LargeKernelThreshold = - LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor)) - : std::numeric_limits<CostType>::max(); + const CostType LargeFnThreshold = + LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor)) + : std::numeric_limits<CostType>::max(); std::vector<DenseSet<const Function *>> Partitions; Partitions.resize(NumParts); - // Assign a partition to each kernel, and try to keep the partitions more or + // Assign functions to partitions, and try to keep the partitions more or // less balanced. We do that through a priority queue sorted in reverse, so we // can always look at the partition with the least content. // // There are some cases where we will be deliberately unbalanced though. - // - Large kernels: we try to merge with existing partitions to reduce code + // - Large functions: we try to merge with existing partitions to reduce code // duplication. - // - Kernels with indirect or external calls always go in the first partition - // (P0). + // - Functions with indirect or external calls always go in the first + // partition (P0). auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a, const std::pair<PartitionID, CostType> &b) { // When two partitions have the same cost, assign to the one with the @@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, for (unsigned I = 0; I < NumParts; ++I) BalancingQueue.push_back(std::make_pair(I, 0)); - // Helper function to handle assigning a kernel to a partition. This takes + // Helper function to handle assigning a function to a partition. This takes // care of updating the balancing queue.
const auto AssignToPartition = [&](PartitionID PID, - const KernelWithDependencies &KWD) { + const FunctionWithDependencies &FWD) { auto &FnsInPart = Partitions[PID]; - FnsInPart.insert(KWD.Fn); - FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end()); + FnsInPart.insert(FWD.Fn); + FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); - SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> "; - if (!KWD.Dependencies.empty()) { - SML << KWD.Dependencies.size() << " dependencies added\n"; + SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> "; + if (!FWD.Dependencies.empty()) { + SML << FWD.Dependencies.size() << " dependencies added\n"; }; // Update the balancing queue. we scan backwards because in the common case @@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, sort(BalancingQueue, ComparePartitions); }; - for (auto &CurKernel : WorkList) { - // When a kernel has indirect calls, it must stay in the first partition + for (auto &CurFn : WorkList) { + // When a function has indirect calls, it must stay in the first partition // alongside every reachable non-entry function. This is a nightmare case // for splitting as it severely limits what we can do. - if (CurKernel.HasIndirectCall) { - SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn) + if (CurFn.HasIndirectCall) { + SML << "Function with indirect call(s): " << getName(*CurFn.Fn) << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); + AssignToPartition(0, CurFn); continue; } - // When a kernel has non duplicatable dependencies, we have to keep it in + // When a function has non duplicatable dependencies, we have to keep it in // the first partition as well. This is a conservative approach, a // finer-grained approach could keep track of which dependencies are // non-duplicatable exactly and just make sure they're grouped together. 
- if (CurKernel.HasNonDuplicatableDependecy) { - SML << "Kernel with externally visible dependency " - << getName(*CurKernel.Fn) << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); + if (CurFn.HasNonDuplicatableDependecy) { + SML << "Function with externally visible dependency " + << getName(*CurFn.Fn) << " defaulting to P0\n"; + AssignToPartition(0, CurFn); continue; } - // Be smart with large kernels to avoid duplicating their dependencies. - if (CurKernel.isLarge(LargeKernelThreshold)) { - assert(LargeKernelOverlapForMerge >= 0.0f && - LargeKernelOverlapForMerge <= 1.0f); - SML << "Large Kernel: " << getName(*CurKernel.Fn) + // Be smart with large functions to avoid duplicating their dependencies. + if (CurFn.isLarge(LargeFnThreshold)) { + assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f); + SML << "Large Function: " << getName(*CurFn.Fn) << " - looking for partition with at least " - << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n"; + << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n"; bool Assigned = false; for (const auto &[PID, Fns] : enumerate(Partitions)) { - float Overlap = calculateOverlap(CurKernel.Dependencies, Fns); + float Overlap = calculateOverlap(CurFn.Dependencies, Fns); SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" << PID << '\n'; - if (Overlap > LargeKernelOverlapForMerge) { + if (Overlap > LargeFnOverlapForMerge) { SML << " selecting P" << PID << '\n'; - AssignToPartition(PID, CurKernel); + AssignToPartition(PID, CurFn); Assigned = true; } } @@ -554,41 +563,34 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, // Normal "load-balancing", assign to partition with least pressure. 
auto [PID, CurCost] = BalancingQueue.back(); - AssignToPartition(PID, CurKernel); + AssignToPartition(PID, CurFn); } - // Work is mostly done now, verify the partioning and add all functions we may - // have missed (= unreachable, or we don't understand how they're reached) to - // P0. - DenseSet<const Function *> AllFunctions; - for (const auto &[Idx, Part] : enumerate(Partitions)) { - CostType Cost = 0; - for (auto *Fn : Part) { - // external linkage functions should exclusively be in the first partition - // at this stage. In theory, we should only ever see external linkage - // functions here if they're kernels, or if they've been added due to a - // kernel using indirect calls somewhere in its CallGraph. - assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn))); - Cost += FnCosts.at(Fn); + if (SML) { + for (const auto &[Idx, Part] : enumerate(Partitions)) { + CostType Cost = 0; + for (auto *Fn : Part) + Cost += FnCosts.at(Fn); + SML << "P" << Idx << " has a total cost of " << Cost << " (" + << format("%0.2f", (float(Cost) / ModuleCost) * 100) + << "% of source module)\n"; } - SML << "P" << Idx << " has a total cost of " << Cost << " (" - << format("%0.2f", (float(Cost) / ModuleCost) * 100) - << "% of source module)\n"; - AllFunctions.insert(Part.begin(), Part.end()); + + SML << "--Partitioning Done--\n\n"; } - // Add missed functions to P0. This will take care of adding things like - // external functions with no callers in the module to P0. This should be - // fairly rare as AMDGPU internalizes everything in most cases, so unused - // internal functions would get removed. + // Check no functions were missed. 
+#ifndef NDEBUG + DenseSet<const Function *> AllFunctions; + for (const auto &Part : Partitions) + AllFunctions.insert(Part.begin(), Part.end()); + for (auto &Fn : M) { if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { - SML << getName(Fn) << " has no partition assigned, defaulting to P0\n"; - Partitions[0].insert(&Fn); + assert(AllFunctions.contains(&Fn) && "Missed a function?!"); } } - - SML << "--Partitioning Done--\n\n"; +#endif return Partitions; } @@ -604,10 +606,17 @@ static void externalize(GlobalValue &GV) { if (!GV.hasName()) GV.setName("__llvmsplit_unnamed"); } -} // end anonymous namespace -void llvm::splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, +static bool hasDirectCaller(const Function &Fn) { + for (auto &U : Fn.uses()) { + if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U)) + return true; + } + return false; +} + +static void splitAMDGPUModule( + GetTTIFn GetTTI, Module &M, unsigned N, function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { SplitModuleLogger SML(M); @@ -648,15 +657,36 @@ void llvm::splitAMDGPUModule( // Start by calculating the cost of every function in the module, as well as // the module's overall cost. DenseMap<const Function *, CostType> FnCosts; - const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts); + const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts); - // Gather every kernel into a WorkList, then sort it by descending total cost - // of the kernel so the biggest kernels are seen first. - SmallVector<KernelWithDependencies> WorkList; + // First, gather every kernel into the worklist. + SmallVector<FunctionWithDependencies> WorkList; for (auto &Fn : M) { if (isEntryPoint(&Fn) && !Fn.isDeclaration()) WorkList.emplace_back(SML, CG, FnCosts, &Fn); } + + // Then, find missing functions that need to be considered as additional + // roots.
These can't be called in theory, but in practice we still have to + // handle them to avoid linker errors. + { + DenseSet<const Function *> SeenFunctions; + for (const auto &FWD : WorkList) { + SeenFunctions.insert(FWD.Fn); + SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); + } + + for (auto &Fn : M) { + // If this function is not part of any kernel's dependencies and isn't + // directly called, consider it as a root. + if (!Fn.isDeclaration() && !isEntryPoint(&Fn) && + !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) { + WorkList.emplace_back(SML, CG, FnCosts, &Fn); + } + } + } + + // Sort the worklist so the most expensive roots are seen first. sort(WorkList, [&](auto &A, auto &B) { // Sort by total cost, and if the total cost is identical, sort // alphabetically. @@ -667,13 +697,20 @@ void llvm::splitAMDGPUModule( if (SML) { SML << "Worklist\n"; - for (const auto &KWD : WorkList) { - SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost - << " indirect:" << KWD.HasIndirectCall - << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy + for (const auto &FWD : WorkList) { + SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost + << " indirect:" << FWD.HasIndirectCall + << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy << ")\n"; - for (const auto *Dep : KWD.Dependencies) - SML << " [Dep] " << getName(*Dep) << '\n'; + // Sort function names before printing to ensure determinism. + SmallVector<std::string> SortedDepNames; + SortedDepNames.reserve(FWD.Dependencies.size()); + for (const auto *Dep : FWD.Dependencies) + SortedDepNames.push_back(getName(*Dep)); + sort(SortedDepNames); + + for (const auto &Name : SortedDepNames) + SML << " [dependency] " << Name << '\n'; } } @@ -700,16 +737,8 @@ void llvm::splitAMDGPUModule( std::unique_ptr<Module> MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. 
- if (const auto *Fn = dyn_cast<Function>(GV)) { -// Check we don't import an external linkage function in any -// partition other than P0. -#ifndef NDEBUG - if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) { - assert((I == 0) == FnsInPart.contains(Fn)); - } -#endif + if (const auto *Fn = dyn_cast<Function>(GV)) return FnsInPart.contains(Fn); - } if (NeedsConservativeImport(GV)) return true; @@ -742,3 +771,16 @@ void llvm::splitAMDGPUModule( << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) << "% of original module)\n"; } +} // namespace + +PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, + ModuleAnalysisManager &MAM) { + FunctionAnalysisManager &FAM = + MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; + splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); + // We don't change the original module. + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h index 6171643bd4ad..d814dedd6f0c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h @@ -12,18 +12,27 @@ #define LLVM_TARGET_AMDGPUSPLITMODULE_H #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/IR/PassManager.h" #include <memory> namespace llvm { -class Module; -class AMDGPUTargetMachine; - /// Splits the module M into N linkable partitions. The function ModuleCallback /// is called N times passing each individual partition as the MPart argument. 
-void splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, - function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback); +class AMDGPUSplitModulePass : public PassInfoMixin<AMDGPUSplitModulePass> { +public: + using ModuleCreationCallback = + function_ref<void(std::unique_ptr<Module> MPart)>; + + AMDGPUSplitModulePass(unsigned N, ModuleCreationCallback ModuleCallback) + : N(N), ModuleCallback(ModuleCallback) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + +private: + unsigned N; + ModuleCreationCallback ModuleCallback; +}; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0751c8dc8b8b..a8e26f104f58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1104,6 +1104,9 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, if (hasFlatScratchInit()) NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + + if (hasPrivateSegmentSize()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); } void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ce997c659094..9162e110aa10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -658,8 +658,7 @@ Error AMDGPUTargetMachine::buildCodeGenPipeline( return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } -void AMDGPUTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" @@ -829,8 +828,24 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { bool AMDGPUTargetMachine::splitModule( Module 
&M, unsigned NumParts, - function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const { - splitAMDGPUModule(*this, M, NumParts, ModuleCallback); + function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { + // FIXME(?): Would be better to use an already existing Analysis/PassManager, + // but all current users of this API don't have one ready and would need to + // create one anyway. Let's hide the boilerplate for now to keep it simple. + + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + PassBuilder PB(this); + PB.registerModuleAnalyses(MAM); + PB.registerFunctionAnalyses(FAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback)); + MPM.run(M, MAM); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 2cfd232483a8..0f74fbc22fa8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -58,8 +58,7 @@ public: const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; void registerDefaultAliasAnalyses(AAManager &) override; /// Get the integer value of a null pointer in the given address space. 
@@ -76,7 +75,7 @@ public: bool splitModule(Module &M, unsigned NumParts, function_ref<void(std::unique_ptr<Module> MPart)> - ModuleCallback) const override; + ModuleCallback) override; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 437e01c37c6b..1192b49fd1f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); @@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const { switch (IID) { - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_is_shared: @@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index bdb5a8d9a0a0..b08957d22ee7 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1314,6 +1314,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } private: + void createConstantSymbol(StringRef Id, int64_t Val); + 
bool ParseAsAbsoluteExpression(uint32_t &Ret); bool OutOfRangeError(SMRange Range); /// Calculate VGPR/SGPR blocks required for given target, reserved @@ -1331,12 +1333,12 @@ private: /// \param SGPRRange [in] Token range, used for SGPR diagnostics. /// \param VGPRBlocks [out] Result VGPR block count. /// \param SGPRBlocks [out] Result SGPR block count. - bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, - bool FlatScrUsed, bool XNACKUsed, + bool calculateGPRBlocks(const FeatureBitset &Features, const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, bool XNACKUsed, std::optional<bool> EnableWavefrontSize32, - unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, - unsigned &VGPRBlocks, unsigned &SGPRBlocks); + const MCExpr *NextFreeVGPR, SMRange VGPRRange, + const MCExpr *NextFreeSGPR, SMRange SGPRRange, + const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); bool ParseDirectiveAMDHSACodeObjectVersion(); bool ParseDirectiveAMDHSAKernel(); @@ -1408,36 +1410,28 @@ public: setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); - { - // TODO: make those pre-defined variables read-only. - // Currently there is none suitable machinery in the core llvm-mc for this. - // MCSymbol::isRedefinable is intended for another purpose, and - // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); - MCContext &Ctx = getContext(); - if (ISA.Major >= 6 && isHsaAbi(getSTI())) { - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); - } else { - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); - } - if (ISA.Major >= 6 && isHsaAbi(getSTI())) { - initializeGprCountSymbol(IS_VGPR); - initializeGprCountSymbol(IS_SGPR); - } else - KernelScope.initialize(getContext()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { + createConstantSymbol(".amdgcn.gfx_generation_number", ISA.Major); + createConstantSymbol(".amdgcn.gfx_generation_minor", ISA.Minor); + createConstantSymbol(".amdgcn.gfx_generation_stepping", ISA.Stepping); + } else { + createConstantSymbol(".option.machine_version_major", ISA.Major); + createConstantSymbol(".option.machine_version_minor", ISA.Minor); + createConstantSymbol(".option.machine_version_stepping", ISA.Stepping); } + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { + initializeGprCountSymbol(IS_VGPR); + initializeGprCountSymbol(IS_SGPR); + } else + KernelScope.initialize(getContext()); + + for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions()) + createConstantSymbol(Symbol, 
Code); + + createConstantSymbol("UC_VERSION_W64_BIT", 0x2000); + createConstantSymbol("UC_VERSION_W32_BIT", 0x4000); + createConstantSymbol("UC_VERSION_MDP_BIT", 0x8000); } bool hasMIMG_R128() const { @@ -2486,6 +2480,16 @@ bool AMDGPUOperand::isInlineValue() const { // AsmParser //===----------------------------------------------------------------------===// +void AMDGPUAsmParser::createConstantSymbol(StringRef Id, int64_t Val) { + // TODO: make those pre-defined variables read-only. + // Currently there is none suitable machinery in the core llvm-mc for this. + // MCSymbol::isRedefinable is intended for another purpose, and + // AsmParser::parseDirectiveSet() cannot be specialized for specific target. + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Id); + Sym->setVariableValue(MCConstantExpr::create(Val, Ctx)); +} + static int getRegClass(RegisterKind Is, unsigned RegWidth) { if (Is == IS_VGPR) { switch (RegWidth) { @@ -5352,41 +5356,64 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { } bool AMDGPUAsmParser::calculateGPRBlocks( - const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, - bool XNACKUsed, std::optional<bool> EnableWavefrontSize32, - unsigned NextFreeVGPR, SMRange VGPRRange, unsigned NextFreeSGPR, - SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks) { + const FeatureBitset &Features, const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, bool XNACKUsed, + std::optional<bool> EnableWavefrontSize32, const MCExpr *NextFreeVGPR, + SMRange VGPRRange, const MCExpr *NextFreeSGPR, SMRange SGPRRange, + const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
IsaVersion Version = getIsaVersion(getSTI().getCPU()); + MCContext &Ctx = getContext(); - unsigned NumVGPRs = NextFreeVGPR; - unsigned NumSGPRs = NextFreeSGPR; + const MCExpr *NumSGPRs = NextFreeSGPR; + int64_t EvaluatedSGPRs; if (Version.Major >= 10) - NumSGPRs = 0; + NumSGPRs = MCConstantExpr::create(0, Ctx); else { unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI()); - if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && - NumSGPRs > MaxAddressableNumSGPRs) + if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && Version.Major >= 8 && + !Features.test(FeatureSGPRInitBug) && + static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs) return OutOfRangeError(SGPRRange); - NumSGPRs += - IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); + const MCExpr *ExtraSGPRs = + AMDGPUMCExpr::createExtraSGPRs(VCCUsed, FlatScrUsed, XNACKUsed, Ctx); + NumSGPRs = MCBinaryExpr::createAdd(NumSGPRs, ExtraSGPRs, Ctx); - if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && - NumSGPRs > MaxAddressableNumSGPRs) + if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && + (Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs) return OutOfRangeError(SGPRRange); if (Features.test(FeatureSGPRInitBug)) - NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - } + NumSGPRs = + MCConstantExpr::create(IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG, Ctx); + } + + // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: + // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 + auto GetNumGPRBlocks = [&Ctx](const MCExpr *NumGPR, + unsigned Granule) -> const MCExpr * { + const MCExpr *OneConst = MCConstantExpr::create(1ul, Ctx); + const MCExpr *GranuleConst = MCConstantExpr::create(Granule, Ctx); + const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx); + const MCExpr *AlignToGPR = + AMDGPUMCExpr::createAlignTo(MaxNumGPR, 
GranuleConst, Ctx); + const MCExpr *DivGPR = + MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); + const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); + return SubGPR; + }; - VGPRBlocks = IsaInfo::getEncodedNumVGPRBlocks(&getSTI(), NumVGPRs, - EnableWavefrontSize32); - SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); + VGPRBlocks = GetNumGPRBlocks( + NextFreeVGPR, + IsaInfo::getVGPREncodingGranule(&getSTI(), EnableWavefrontSize32)); + SGPRBlocks = + GetNumGPRBlocks(NumSGPRs, IsaInfo::getSGPREncodingGranule(&getSTI())); return false; } @@ -5410,14 +5437,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { IsaVersion IVersion = getIsaVersion(getSTI().getCPU()); + const MCExpr *ZeroExpr = MCConstantExpr::create(0, getContext()); + const MCExpr *OneExpr = MCConstantExpr::create(1, getContext()); + SMRange VGPRRange; - uint64_t NextFreeVGPR = 0; - uint64_t AccumOffset = 0; + const MCExpr *NextFreeVGPR = ZeroExpr; + const MCExpr *AccumOffset = MCConstantExpr::create(0, getContext()); uint64_t SharedVGPRCount = 0; uint64_t PreloadLength = 0; uint64_t PreloadOffset = 0; SMRange SGPRRange; - uint64_t NextFreeSGPR = 0; + const MCExpr *NextFreeSGPR = ZeroExpr; // Count the number of user SGPRs implied from the enabled feature bits. unsigned ImpliedUserSGPRCount = 0; @@ -5425,8 +5455,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { // Track if the asm explicitly contains the directive for the user SGPR // count. 
std::optional<unsigned> ExplicitUserSGPRCount; - bool ReserveVCC = true; - bool ReserveFlatScr = true; + const MCExpr *ReserveVCC = OneExpr; + const MCExpr *ReserveFlatScr = OneExpr; std::optional<bool> EnableWavefrontSize32; while (true) { @@ -5620,34 +5650,29 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal, ValRange); } else if (ID == ".amdhsa_next_free_vgpr") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); VGPRRange = ValRange; - NextFreeVGPR = Val; + NextFreeVGPR = ExprVal; } else if (ID == ".amdhsa_next_free_sgpr") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); SGPRRange = ValRange; - NextFreeSGPR = Val; + NextFreeSGPR = ExprVal; } else if (ID == ".amdhsa_accum_offset") { if (!isGFX90A()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); - AccumOffset = Val; + AccumOffset = ExprVal; } else if (ID == ".amdhsa_reserve_vcc") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); - if (!isUInt<1>(Val)) + if (EvaluatableExpr && !isUInt<1>(Val)) return OutOfRangeError(ValRange); - ReserveVCC = Val; + ReserveVCC = ExprVal; } else if (ID == ".amdhsa_reserve_flat_scratch") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 7) return Error(IDRange.Start, "directive requires gfx7+", IDRange); if (hasArchitectedFlatScratch()) return Error(IDRange.Start, "directive is not supported with architected flat scratch", IDRange); - if (!isUInt<1>(Val)) + if (EvaluatableExpr && !isUInt<1>(Val)) return OutOfRangeError(ValRange); - ReserveFlatScr = Val; + ReserveFlatScr = ExprVal; } else if (ID == ".amdhsa_reserve_xnack_mask") { if (IVersion.Major < 8) return Error(IDRange.Start, "directive requires gfx8+", IDRange); @@ -5771,8 +5796,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!Seen.contains(".amdhsa_next_free_sgpr")) return TokError(".amdhsa_next_free_sgpr directive is required"); - unsigned VGPRBlocks; - unsigned SGPRBlocks; + const MCExpr 
*VGPRBlocks; + const MCExpr *SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, getTargetStreamer().getTargetID()->isXnackOnOrAny(), EnableWavefrontSize32, NextFreeVGPR, @@ -5780,19 +5805,26 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { SGPRBlocks)) return true; - if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( - VGPRBlocks)) + int64_t EvaluatedVGPRBlocks; + bool VGPRBlocksEvaluatable = + VGPRBlocks->evaluateAsAbsolute(EvaluatedVGPRBlocks); + if (VGPRBlocksEvaluatable && + !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( + static_cast<uint64_t>(EvaluatedVGPRBlocks))) { return OutOfRangeError(VGPRRange); + } AMDGPU::MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()), + KD.compute_pgm_rsrc1, VGPRBlocks, COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT, COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext()); - if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( - SGPRBlocks)) + int64_t EvaluatedSGPRBlocks; + if (SGPRBlocks->evaluateAsAbsolute(EvaluatedSGPRBlocks) && + !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( + static_cast<uint64_t>(EvaluatedSGPRBlocks))) return OutOfRangeError(SGPRRange); AMDGPU::MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()), + KD.compute_pgm_rsrc1, SGPRBlocks, COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT, COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext()); @@ -5822,16 +5854,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (isGFX90A()) { if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); - if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3)) + int64_t EvaluatedAccum; + bool AccumEvaluatable = AccumOffset->evaluateAsAbsolute(EvaluatedAccum); + uint64_t UEvaluatedAccum = EvaluatedAccum; + if (AccumEvaluatable && 
+ (UEvaluatedAccum < 4 || UEvaluatedAccum > 256 || (UEvaluatedAccum & 3))) return TokError("accum_offset should be in range [4..256] in " "increments of 4"); - if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4)) + + int64_t EvaluatedNumVGPR; + if (NextFreeVGPR->evaluateAsAbsolute(EvaluatedNumVGPR) && + AccumEvaluatable && + UEvaluatedAccum > + alignTo(std::max((uint64_t)1, (uint64_t)EvaluatedNumVGPR), 4)) return TokError("accum_offset exceeds total VGPR allocation"); - MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc3, - MCConstantExpr::create(AccumOffset / 4 - 1, getContext()), - COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, - COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext()); + const MCExpr *AdjustedAccum = MCBinaryExpr::createSub( + MCBinaryExpr::createDiv( + AccumOffset, MCConstantExpr::create(4, getContext()), getContext()), + MCConstantExpr::create(1, getContext()), getContext()); + MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, AdjustedAccum, + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + getContext()); } if (IVersion.Major >= 10 && IVersion.Major < 12) { @@ -5840,7 +5884,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError("shared_vgpr_count directive not valid on " "wavefront size 32"); } - if (SharedVGPRCount * 2 + VGPRBlocks > 63) { + + if (VGPRBlocksEvaluatable && + (SharedVGPRCount * 2 + static_cast<uint64_t>(EvaluatedVGPRBlocks) > + 63)) { return TokError("shared_vgpr_count*2 + " "compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot " "exceed 63\n"); @@ -8353,7 +8400,7 @@ void AMDGPUAsmParser::onBeginOfFile() { /// max(expr, ...) 
/// bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { - using AGVK = AMDGPUVariadicMCExpr::VariadicKind; + using AGVK = AMDGPUMCExpr::VariantKind; if (isToken(AsmToken::Identifier)) { StringRef TokenId = getTokenStr(); @@ -8383,7 +8430,7 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { "mismatch of commas in " + Twine(TokenId) + " expression"); return true; } - Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext()); + Res = AMDGPUMCExpr::create(VK, Exprs, getContext()); return false; } const MCExpr *Expr; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b05834e5803a..3b8d94b74400 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -399,12 +399,10 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> { RegisterOperand tfeVDataOp = - !if(!eq(RC.Size, 32), AVLdSt_64, - !if(!eq(RC.Size, 64), AVLdSt_96, - !if(!eq(RC.Size, 96), AVLdSt_128, - !if(!eq(RC.Size, 128), AVLdSt_160, - RegisterOperand<VReg_1> // Invalid register. 
- )))); + !cond(!eq(RC.Size, 32) : AVLdSt_64, + !eq(RC.Size, 64) : AVLdSt_96, + !eq(RC.Size, 96) : AVLdSt_128, + !eq(RC.Size, 128) : AVLdSt_160); RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret); } @@ -534,7 +532,7 @@ multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i3 } multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{ - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>; } defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>; @@ -631,7 +629,7 @@ multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt = } multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>; } defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>; @@ -1151,27 +1149,21 @@ let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag >; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < "buffer_atomic_fmin", VGPR_32, f32, null_frag >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < "buffer_atomic_fmax", VGPR_32, f32, null_frag >; - } let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag >; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin_x2", VReg_64, f64, null_frag ->; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax_x2", VReg_64, f64, null_frag ->; - } let 
SubtargetPredicate = HasD16LoadStore in { @@ -1235,12 +1227,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; @@ -1249,7 +1241,9 @@ let SubtargetPredicate = isGFX12Plus in { defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics < "buffer_atomic_cond_sub_u32", VGPR_32, i32 >; +} +let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in { let FPAtomic = 1 in defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16 @@ -1320,6 +1314,9 @@ let SubtargetPredicate = isGFX90APlus in { let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; + + // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2 + // depending on some subtargets. defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 @@ -1421,18 +1418,22 @@ let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. 
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">; +foreach vt = Reg32Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORD">; +} + +foreach vt = Reg64Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX2">; +} + +foreach vt = Reg96Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX3">; +} + +foreach vt = Reg128Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX4">; +} + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">; @@ -1495,6 +1496,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; +defm : 
MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; @@ -1521,18 +1523,22 @@ let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">; +foreach vt = Reg32Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORD">; +} + +foreach vt = Reg64Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX2">; +} + +foreach vt = Reg96Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX3">; +} + +foreach vt = Reg128Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX4">; +} + defm : 
MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; @@ -1545,7 +1551,7 @@ multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bi defvar Op = !cast<SDPatternOperator>(OpPrefix # !if(!eq(RtnMode, "ret"), "", "_noret") - # !if(isIntr, "", "_" # vt.Size)); + # !if(isIntr, "", "_" # vt)); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { @@ -1582,7 +1588,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" # !if(!eq(RtnMode, "ret"), "", "_noret") - # "_" # vt.Size); + # "_" # vt); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; @@ -1641,6 +1647,16 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # } // end foreach Ty +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { +defm : BufferAtomicPat<"atomic_load_fmin_global", f32, "BUFFER_ATOMIC_FMIN">; +defm : BufferAtomicPat<"atomic_load_fmax_global", f32, "BUFFER_ATOMIC_FMAX">; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { +defm : BufferAtomicPat<"atomic_load_fmin_global", f64, "BUFFER_ATOMIC_MIN_F64">; +defm : BufferAtomicPat<"atomic_load_fmax_global", f64, "BUFFER_ATOMIC_MAX_F64">; +} + defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">; defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">; @@ -1695,9 +1711,11 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, 
RtnModes>; } + + // FIXME: This needs a !HasUnrestrictedSOffset predicate defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>; } @@ -1728,24 +1746,29 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; -let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16">; +} + let SubtargetPredicate = isGFX12Plus in { - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">; defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>; +} - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { +defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; } -let OtherPredicates = [isGFX6GFX7GFX10Plus] in { +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; } -let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">; - defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">; + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; } class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag < @@ -1799,33 +1822,28 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { - let SubtargetPredicate = isGFX9Only in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; - - let SubtargetPredicate = isGFX12Plus in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>; -} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; +} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { - let SubtargetPredicate = isGFX9Only in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; - - let SubtargetPredicate = isGFX12Plus in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>; -} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts -let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { +let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; +} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; -} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 +} //End let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { @@ -1897,7 +1915,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri } multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>; } defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">; @@ -1948,7 +1966,7 @@ multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt, multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt, PatFrag ld> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>; } defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>; @@ -2189,7 +2207,7 @@ multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt, multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = 
[HasUnrestrictedSOffset] in { defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>; } defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; @@ -2204,7 +2222,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; -let OtherPredicates = [HasUnpackedD16VMem] in { +let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; @@ -2212,7 +2230,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in { defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
-let OtherPredicates = [HasPackedD16VMem] in { +let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; @@ -2261,7 +2279,7 @@ multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt, multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>; } defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; @@ -2276,7 +2294,7 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; -let OtherPredicates = [HasUnpackedD16VMem] in { +let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; @@ -2284,7 +2302,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in { defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
-let OtherPredicates = [HasPackedD16VMem] in { +let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; @@ -2296,6 +2314,12 @@ let OtherPredicates = [HasPackedD16VMem] in { // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the +// specific pseudo (bothen in this case) since any of them will work. +class get_BUF_ps<string name> { + string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic; +} + //===----------------------------------------------------------------------===// // Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// @@ -2327,8 +2351,8 @@ multiclass MUBUF_Real_gfx11<bits<8> op, string real_name = !cast<MUBUF_Pseudo>(N } } -class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : - Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> { +class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef, string asmName> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef, asmName> { let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); @@ -2338,9 +2362,10 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : let Inst{55} = ps.tfe; } -multiclass MUBUF_Real_gfx10<bits<8> op> { - defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> { +multiclass MUBUF_Real_gfx10<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defvar ps = !cast<MUBUF_Pseudo>(psName); + def _gfx10 : 
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10, asmName> { let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); let Inst{25} = op{7}; let AssemblerPredicate = isGFX10Only; @@ -2348,9 +2373,10 @@ multiclass MUBUF_Real_gfx10<bits<8> op> { } } -multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> { - defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { +multiclass MUBUF_Real_gfx6_gfx7<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defvar ps = !cast<MUBUF_Pseudo>(psName); + def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, asmName> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX6GFX7; let DecoderNamespace = "GFX6GFX7"; @@ -2359,7 +2385,7 @@ multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> { multiclass MUBUF_Real_gfx6<bits<8> op> { defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { + def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX6; let DecoderNamespace = "GFX6"; @@ -2368,7 +2394,7 @@ multiclass MUBUF_Real_gfx6<bits<8> op> { multiclass MUBUF_Real_gfx7<bits<8> op> { defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { + def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX7Only; let DecoderNamespace = "GFX7"; @@ -2445,9 +2471,15 @@ class VBUFFER_Real_gfx12<bits<8> op, BUF_Pseudo ps, string real_name> : multiclass VBUFFER_MUBUF_Real_gfx12<bits<8> op, string real_name> { defvar ps = !cast<MUBUF_Pseudo>(NAME); def _gfx12 : VBUFFER_Real_gfx12<op, ps, real_name> { - // Set the last bit of format to 1 to avoid round-trip issues, as some tools + // Set the format 
field to be 1 to avoid round-trip issues, as some tools // print BUF_FMT_INVALID for format 0. - let Inst{55} = 0b1; + let Inst{61-55} = 0b0000001; + } + // Have a version of the instruction to disassemble to for any other + // format field values. + def _gfx12_format : VBUFFER_Real<op, ps, real_name> { + let AsmVariantName = "NonParsable"; + let DecoderNamespace = "GFX12"; } } @@ -2463,12 +2495,6 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { // MUBUF - GFX11, GFX12. //===----------------------------------------------------------------------===// -// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the -// specific pseudo (bothen in this case) since any of them will work. -class get_BUF_ps<string name> { - string Mnemonic = !cast<BUF_Pseudo>(name # "_BOTHEN").Mnemonic; -} - // gfx11 instruction that accept both old and new assembler name. class Mnem_gfx11_gfx12 <string mnemonic, string real_name> : AMDGPUMnemonicAlias<mnemonic, real_name> { @@ -2690,18 +2716,20 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op, bit isTFE = 0> { defm _LDS_BOTHEN : MUBUF_Real_gfx10<op>; } } -multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> { - defm _BOTHEN_RTN : MUBUF_Real_gfx10<op>; - defm _IDXEN_RTN : MUBUF_Real_gfx10<op>; - defm _OFFEN_RTN : MUBUF_Real_gfx10<op>; - defm _OFFSET_RTN : MUBUF_Real_gfx10<op>; +multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defm _BOTHEN_RTN : MUBUF_Real_gfx10<op, psName#"_BOTHEN_RTN", asmName>; + defm _IDXEN_RTN : MUBUF_Real_gfx10<op, psName#"_IDXEN_RTN", asmName>; + defm _OFFEN_RTN : MUBUF_Real_gfx10<op, psName#"_OFFEN_RTN", asmName>; + defm _OFFSET_RTN : MUBUF_Real_gfx10<op, psName#"_OFFSET_RTN", asmName>; } -multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : - MUBUF_Real_Atomics_RTN_gfx10<op> { - defm _BOTHEN : MUBUF_Real_gfx10<op>; - defm _IDXEN : MUBUF_Real_gfx10<op>; - defm _OFFEN : MUBUF_Real_gfx10<op>; - defm 
_OFFSET : MUBUF_Real_gfx10<op>; +multiclass MUBUF_Real_Atomics_gfx10<bits<8> op, string psName = NAME, + string asmName = get_BUF_ps<psName>.Mnemonic> : + MUBUF_Real_Atomics_RTN_gfx10<op, psName, asmName> { + defm _BOTHEN : MUBUF_Real_gfx10<op, psName#"_BOTHEN", asmName>; + defm _IDXEN : MUBUF_Real_gfx10<op, psName#"_IDXEN", asmName>; + defm _OFFEN : MUBUF_Real_gfx10<op, psName#"_OFFEN", asmName>; + defm _OFFSET : MUBUF_Real_gfx10<op, psName#"_OFFSET", asmName>; } defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; @@ -2756,18 +2784,18 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op, bit isTFE = 0> { defm _LDS_BOTHEN : MUBUF_Real_gfx6_gfx7<op>; } } -multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> { - defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op>; - defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op>; - defm _IDXEN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFEN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFSET : MUBUF_Real_gfx6_gfx7<op>; +multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op, string psName, string asmName> { + defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64", asmName>; + defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN", asmName>; + defm _IDXEN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN", asmName>; + defm _OFFEN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN", asmName>; + defm _OFFSET : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET", asmName>; - defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op>; + defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64_RTN", asmName>; + defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN_RTN", asmName>; + defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN_RTN", asmName>; + defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN_RTN", asmName>; + defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET_RTN", asmName>; 
} multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> : @@ -2782,8 +2810,10 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> { defm _TFE : MUBUF_Real_AllAddr_Lds_Helper_gfx6_gfx7_gfx10<op, 1>; } -multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> : - MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>; +multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op, string psName = NAME, + string asmName = get_BUF_ps<psName>.Mnemonic> : + MUBUF_Real_Atomics_gfx6_gfx7<op, psName, asmName>, + MUBUF_Real_Atomics_gfx10<op, psName, asmName>; // FIXME-GFX6: Following instructions are available only on GFX6. //defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>; @@ -2843,8 +2873,8 @@ defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; // FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f, "BUFFER_ATOMIC_MIN_F64", "buffer_atomic_fmin_x2">; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060, "BUFFER_ATOMIC_MAX_F64", "buffer_atomic_fmax_x2">; defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>; @@ -3066,9 +3096,9 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, bit isTFE = 0> : MUBUF_Real_vi<op> { } if ps.FPAtomic then { - let SubtargetPredicate = isGFX90AOnly, - AssemblerPredicate = isGFX90AOnly in - defm NAME : MUBUF_Real_gfx90a<op, 0>; + let AssemblerPredicate = isGFX90AOnly in + defm NAME : MUBUF_Real_gfx90a<op, 0>; + def _gfx940 : MUBUF_Real_gfx940<op, ps>; } } @@ -3251,10 +3281,7 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_vi <0x3f>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; - -let 
SubtargetPredicate = HasAtomicFaddNoRtnInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; -} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 19bb4300531c..219246b71fe8 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -965,16 +965,16 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; } let OtherPredicates = [HasGDS] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; } } @@ -983,24 +983,24 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_"#vt)>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : 
DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [HasGDS] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + !cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1019,23 +1019,23 @@ class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [HasGDS] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + def : DSAtomicCmpXChgSwapped<noRetInst, vt, 
!cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1053,14 +1053,14 @@ class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; let OtherPredicates = [HasGDS] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1082,6 +1082,12 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">; defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">; + +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; +} + let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">; } @@ -1119,9 +1125,9 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = HasLdsAtomicAddF64 in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; 
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_f64>; let AddedComplexity = 1 in -def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; +def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_f64>; class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < @@ -1135,18 +1141,7 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret } let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { -def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; -let AddedComplexity = 1 in -def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; -def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), - (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) ->; -let AddedComplexity = 1 in -def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), - (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) ->; +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts let OtherPredicates = [HasGDS] in diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 05063c6c321a..76a559c9443b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -21,6 +21,7 @@ #include "SIDefines.h" #include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/DisassemblerTypes.h" #include "llvm/BinaryFormat/ELF.h" @@ -52,6 +53,13 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, // ToDo: AMDGPUDisassembler supports only VI ISA. 
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) report_fatal_error("Disassembly not yet supported for subtarget"); + + for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions()) + createConstantSymbolExpr(Symbol, Code); + + UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000); + UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000); + UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000); } void AMDGPUDisassembler::setABIVersion(unsigned Version) { @@ -421,6 +429,13 @@ DECODE_SDWA(Src32) DECODE_SDWA(Src16) DECODE_SDWA(VopcDst) +static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm, + uint64_t /* Addr */, + const MCDisassembler *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->decodeVersionImm(Imm)); +} + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -1727,6 +1742,41 @@ MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { return MCOperand::createImm(Val); } +MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const { + using VersionField = AMDGPU::EncodingField<7, 0>; + using W64Bit = AMDGPU::EncodingBit<13>; + using W32Bit = AMDGPU::EncodingBit<14>; + using MDPBit = AMDGPU::EncodingBit<15>; + using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>; + + auto [Version, W64, W32, MDP] = Encoding::decode(Imm); + + // Decode into a plain immediate if any unused bits are raised. 
+ if (Encoding::encode(Version, W64, W32, MDP) != Imm) + return MCOperand::createImm(Imm); + + const auto &Versions = AMDGPU::UCVersion::getGFXVersions(); + auto I = find_if(Versions, + [Version = Version](const AMDGPU::UCVersion::GFXVersion &V) { + return V.Code == Version; + }); + MCContext &Ctx = getContext(); + const MCExpr *E; + if (I == Versions.end()) + E = MCConstantExpr::create(Version, Ctx); + else + E = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(I->Symbol), Ctx); + + if (W64) + E = MCBinaryExpr::createOr(E, UCVersionW64Expr, Ctx); + if (W32) + E = MCBinaryExpr::createOr(E, UCVersionW32Expr, Ctx); + if (MDP) + E = MCBinaryExpr::createOr(E, UCVersionMDPExpr, Ctx); + + return MCOperand::createExpr(E); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } @@ -2312,6 +2362,15 @@ Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, return false; } +const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id, + int64_t Val) { + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Id); + assert(!Sym->isVariable()); + Sym->setVariableValue(MCConstantExpr::create(Val, Ctx)); + return MCSymbolRefExpr::create(Sym, Ctx); +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 2061d83af3da..694cd7a9bfd2 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -102,6 +102,11 @@ private: mutable bool HasLiteral; mutable std::optional<bool> EnableWavefrontSize32; unsigned CodeObjectVersion; + const MCExpr *UCVersionW64Expr; + const MCExpr *UCVersionW32Expr; + const MCExpr *UCVersionMDPExpr; + + const MCExpr 
*createConstantSymbolExpr(StringRef Id, int64_t Val); public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, @@ -264,6 +269,8 @@ public: MCOperand decodeSplitBarrier(unsigned Val) const; MCOperand decodeDpp8FI(unsigned Val) const; + MCOperand decodeVersionImm(unsigned Imm) const; + int getTTmpIdx(unsigned Val) const; const MCInstrInfo *getMCII() const { return MCII.get(); } diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 3767dd0b6d47..280def5440c8 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -322,25 +322,25 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$ $ptr), sub1)>; defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_NORET, - atomic_swap_global_noret_32>; + atomic_swap_global_noret_i32>; defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_NORET, - atomic_load_add_global_noret_32>; + atomic_load_add_global_noret_i32>; defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_NORET, - atomic_load_sub_global_noret_32>; + atomic_load_sub_global_noret_i32>; defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_NORET, - atomic_load_min_global_noret_32>; + atomic_load_min_global_noret_i32>; defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_NORET, - atomic_load_umin_global_noret_32>; + atomic_load_umin_global_noret_i32>; defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_NORET, - atomic_load_max_global_noret_32>; + atomic_load_max_global_noret_i32>; defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_NORET, - atomic_load_umax_global_noret_32>; + atomic_load_umax_global_noret_i32>; defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_NORET, - atomic_load_and_global_noret_32>; + atomic_load_and_global_noret_i32>; defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_NORET, - atomic_load_or_global_noret_32>; + atomic_load_or_global_noret_i32>; defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_NORET, - 
atomic_load_xor_global_noret_32>; + atomic_load_xor_global_noret_i32>; // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < @@ -712,37 +712,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", [(truncstorei16_local i32:$src1, i32:$src0)] >; def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_add_local_i32 i32:$src0, i32:$src1))] >; def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_sub_local_i32 i32:$src0, i32:$src1))] >; def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_and_local_i32 i32:$src0, i32:$src1))] >; def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_or_local_i32 i32:$src0, i32:$src1))] >; def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_xor_local_i32 i32:$src0, i32:$src1))] >; def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_min_local_i32 i32:$src0, i32:$src1))] >; def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_max_local_i32 i32:$src0, i32:$src1))] >; def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umin_local_i32 i32:$src0, i32:$src1))] >; def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))] + [(set 
i32:$dst, (atomic_load_umax_local_i32 i32:$src0, i32:$src1))] >; def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_swap_local_i32 i32:$src0, i32:$src1))] >; def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))] + [(set i32:$dst, (atomic_cmp_swap_local_i32 i32:$src0, i32:$src1, i32:$src2))] >; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))] diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index aab19b8adc27..98054dde398b 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -752,25 +752,29 @@ defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", // GFX7-, GFX10-only flat instructions. let SubtargetPredicate = isGFX7GFX10 in { - defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; +} // End SubtargetPredicate = isGFX7GFX10 -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", - VReg_64, f64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", - VReg_64, f64>; +// The names may be flat_atomic_fmin_x2 on some subtargets, but we +// choose this as the canonical name. 
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64", + VReg_64, f64>; -} // End SubtargetPredicate = isGFX7GFX10 +defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64", + VReg_64, f64>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { +defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; +defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; +} let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>; - defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>; - defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>; defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>; - defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; - defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in { @@ -972,6 +976,15 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; let SubtargetPredicate = isGFX12Plus in { + let Uses = [EXEC, M0] in { + defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; + defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>; + } + let Uses = [EXEC, FLAT_SCR, M0] in { + defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>; + defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>; + } + let WaveSizePredicate = isWave32 in { let Mnemonic = "global_load_tr_b128" 
in defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>; @@ -995,10 +1008,6 @@ let SubtargetPredicate = isGFX10Plus in { FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; - defm GLOBAL_ATOMIC_FMIN_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; - defm GLOBAL_ATOMIC_FMAX_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus let OtherPredicates = [HasAtomicFaddNoRtnInsts] in @@ -1105,7 +1114,7 @@ multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addr multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt, @@ -1123,7 +1132,7 @@ multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSp multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; multiclass FlatAtomicPat <string inst, string node, ValueType vt, @@ -1155,8 +1164,8 @@ class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt, int complexity = 0, bit isIntr = 0> { - defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); - defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + defvar rtnNode = !cast<SDPatternOperator>(node # 
!if(isIntr, "", "_" # vt)); + defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt)); let AddedComplexity = complexity in def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>; @@ -1165,21 +1174,6 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>; } -multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt, - ValueType data_vt = vt> { - defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>; -} - -multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt, ValueType data_vt = vt> { - defvar noRtnNode = !cast<PatFrags>(intr # "_noret_" # addrSpaceSuffix); - defvar rtnNode = !cast<PatFrags>(intr # "_" # addrSpaceSuffix); - - let AddedComplexity = 1 in - def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>; - def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>; -} - class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), (inst $vaddr, $offset) @@ -1280,11 +1274,11 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt, multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt.Size), vt, data_vt>; + GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt), vt, data_vt>; multiclass GlobalFLATAtomicPatsRtn<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt.Size), vt, data_vt>; + GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt), vt, data_vt>; multiclass 
GlobalFLATAtomicPats<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : @@ -1431,6 +1425,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; + +let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_"#as, f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_"#as, f32>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; +} + } // end foreach as let SubtargetPredicate = isGFX12Plus in { @@ -1592,37 +1597,26 @@ let OtherPredicates = [isGFX12Plus] in { } } -let OtherPredicates = [isGFX10Plus] in { +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -} - -let OtherPredicates = [isGFX10GFX11] in { defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; - -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } -let OtherPredicates = [isGFX10Only] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", 
"atomic_load_fmin_global", f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>; +let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } let OtherPredicates = [isGFX12Only] in { + // FIXME: Remove these intrinsics defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; - defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; - defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; + defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; + defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; } let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { @@ -1645,37 +1639,44 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", 
"int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } -let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let OtherPredicates = 
[HasBufferFlatGlobalAtomicsF64] in { +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; } let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; +defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; - +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, 
EnableFlatScratch] in { @@ -1745,8 +1746,8 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f // CI //===----------------------------------------------------------------------===// -class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> : - FLAT_Real <op, ps>, +class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps, string asmName = ps.Mnemonic> : + FLAT_Real <op, ps, asmName>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate = isGFX7Only; let DecoderNamespace="GFX7"; @@ -1768,10 +1769,13 @@ def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>; def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>; def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>; -multiclass FLAT_Real_Atomics_ci <bits<7> op> { - defvar ps = !cast<FLAT_Pseudo>(NAME); - def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; - def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +multiclass FLAT_Real_Atomics_ci <bits<7> op, string opName = NAME, + string asmName = !cast<FLAT_Pseudo>(opName).Mnemonic> { + defvar ps = !cast<FLAT_Pseudo>(opName); + defvar ps_rtn = !cast<FLAT_Pseudo>(opName#"_RTN"); + + def _ci : FLAT_Real_ci<op, ps, asmName>; + def _RTN_ci : FLAT_Real_ci<op, ps_rtn, asmName>; } defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30>; @@ -1806,8 +1810,8 @@ defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e>; defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f>; defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40>; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e>; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">; //===----------------------------------------------------------------------===// @@ 
-2089,8 +2093,8 @@ let SubtargetPredicate = isGFX940Plus in { // GFX10. //===----------------------------------------------------------------------===// -class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : - FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { +class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + FLAT_Real<op, ps, opName>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { let AssemblerPredicate = isGFX10Only; let DecoderNamespace = "GFX10"; @@ -2102,25 +2106,28 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : let Inst{55} = 0; } - -multiclass FLAT_Real_Base_gfx10<bits<7> op> { +multiclass FLAT_Real_Base_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> { def _gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName), asmName>; } -multiclass FLAT_Real_RTN_gfx10<bits<7> op> { +multiclass FLAT_Real_RTN_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> { def _RTN_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_RTN"), asmName>; } -multiclass FLAT_Real_SADDR_gfx10<bits<7> op> { +multiclass FLAT_Real_SADDR_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR").Mnemonic> { def _SADDR_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR"), asmName>; } -multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> { +multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR_RTN").Mnemonic> { def _SADDR_RTN_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR_RTN"), asmName>; } multiclass FLAT_Real_ST_gfx10<bits<7> op> { @@ -2128,22 +2135,25 @@ multiclass 
FLAT_Real_ST_gfx10<bits<7> op> { FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>; } -multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> : - FLAT_Real_Base_gfx10<op>, - FLAT_Real_SADDR_gfx10<op>; +multiclass FLAT_Real_AllAddr_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_Base_gfx10<op, OpName, asmName>, + FLAT_Real_SADDR_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_Atomics_gfx10<bits<7> op> : - FLAT_Real_Base_gfx10<op>, - FLAT_Real_RTN_gfx10<op>; +multiclass FLAT_Real_Atomics_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_Base_gfx10<op, OpName, asmName>, + FLAT_Real_RTN_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> : - FLAT_Real_AllAddr_gfx10<op>, - FLAT_Real_RTN_gfx10<op>, - FLAT_Real_SADDR_RTN_gfx10<op>; +multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_AllAddr_gfx10<op, OpName, asmName>, + FLAT_Real_RTN_gfx10<op, OpName, asmName>, + FLAT_Real_SADDR_RTN_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> : - FLAT_Real_RTN_gfx10<op>, - FLAT_Real_SADDR_RTN_gfx10<op>; +multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op, string OpName = NAME> : + FLAT_Real_RTN_gfx10<op, OpName>, + FLAT_Real_SADDR_RTN_gfx10<op, OpName>; multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> : FLAT_Real_Base_gfx10<op>, @@ -2220,8 +2230,8 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">; 
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">; // ENC_FLAT_GLBL. @@ -2278,8 +2288,8 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>; -defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>; -defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; +defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f, "GLOBAL_ATOMIC_MIN_F64", "global_atomic_fmin_x2">; +defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060, "GLOBAL_ATOMIC_MAX_F64", "global_atomic_fmax_x2">; defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>; defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>; @@ -2671,6 +2681,8 @@ defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_s defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; @@ -2741,3 +2753,6 @@ defm SCRATCH_LOAD_SBYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x22, "scratch_ defm SCRATCH_LOAD_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x23, "scratch_load_d16_hi_b16">; defm SCRATCH_STORE_BYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x24, "scratch_store_d16_hi_b8">; defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_store_d16_hi_b16">; + +defm SCRATCH_LOAD_BLOCK : 
VSCRATCH_Real_AllAddr_gfx12<0x53>; +defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 94d93390d091..217279211531 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -116,31 +116,112 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); } +/// Checks whether \p SU can use the cached DAG pressure diffs to compute the +/// current register pressure. +/// +/// This works for the common case, but it has a few exceptions that have been +/// observed through trial and error: +/// - Explicit physical register operands +/// - Subregister definitions +/// +/// In both of those cases, PressureDiff doesn't represent the actual pressure, +/// and querying LiveIntervals through the RegPressureTracker is needed to get +/// an accurate value. +/// +/// We should eventually only use PressureDiff for maximum performance, but this +/// already allows 80% of SUs to take the fast path without changing scheduling +/// at all. Further changes would either change scheduling, or require a lot +/// more logic to recover an accurate pressure estimate from the PressureDiffs. +static bool canUsePressureDiffs(const SUnit &SU) { + if (!SU.isInstr()) + return false; + + // Cannot use pressure diffs for subregister defs or with physregs, it's + // imprecise in both cases. 
+ for (const auto &Op : SU.getInstr()->operands()) { + if (!Op.isReg() || Op.isImplicit()) + continue; + if (Op.getReg().isPhysical() || + (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) + return false; + } + return true; +} + +static void getRegisterPressures(bool AtTop, + const RegPressureTracker &RPTracker, SUnit *SU, + std::vector<unsigned> &Pressure, + std::vector<unsigned> &MaxPressure) { + // getDownwardPressure() and getUpwardPressure() make temporary changes to + // the tracker, so we need to pass those function a non-const copy. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + if (AtTop) + TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); + else + TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); +} + void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, - unsigned VGPRPressure) { + unsigned VGPRPressure, bool IsBottomUp) { Cand.SU = SU; Cand.AtTop = AtTop; if (!DAG->isTrackingPressure()) return; - // getDownwardPressure() and getUpwardPressure() make temporary changes to - // the tracker, so we need to pass those function a non-const copy. - RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - Pressure.clear(); MaxPressure.clear(); - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else { - // FIXME: I think for bottom up scheduling, the register pressure is cached - // and can be retrieved by DAG->getPressureDif(SU). - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + // We try to use the cached PressureDiffs in the ScheduleDAG whenever + // possible over querying the RegPressureTracker. + // + // RegPressureTracker will make a lot of LIS queries which are very + // expensive, it is considered a slow function in this context. 
+ // + // PressureDiffs are precomputed and cached, and getPressureDiff is just a + // trivial lookup into an array. It is pretty much free. + // + // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of + // PressureDiffs. + if (AtTop || !canUsePressureDiffs(*SU)) { + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); + } else { + // Reserve 4 slots. + Pressure.resize(4, 0); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; + + for (const auto &Diff : DAG->getPressureDiff(SU)) { + if (!Diff.isValid()) + continue; + // PressureDiffs is always bottom-up so if we're working top-down we need + // to invert its sign. + Pressure[Diff.getPSet()] += + (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); + } + +#ifdef EXPENSIVE_CHECKS + std::vector<unsigned> CheckPressure, CheckMaxPressure; + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); + if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != + CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != + CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { + errs() << "Register Pressure is inaccurate when calculated through " + "PressureDiff\n" + << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] + << ", expected " + << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" + << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] + << ", expected " + << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; + report_fatal_error("inaccurate register pressure calculation"); + } +#endif } unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -158,7 +239,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= 
SGPRExcessLimit; - // FIXME: We have to enter REG-EXCESS before we reach the actual threshold // to increase the likelihood we don't go over the limits. We should improve // the analysis to look through dependencies to find the path with the least @@ -207,7 +287,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand) { + SchedCandidate &Cand, + bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; @@ -220,8 +301,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); - initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, - SGPRPressure, VGPRPressure); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? 
&Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); @@ -262,7 +343,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); - pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); @@ -270,7 +352,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (VerifyScheduling) { SchedCandidate TCand; TCand.reset(CandPolicy()); - pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); } @@ -282,7 +365,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); - pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); @@ -290,7 +374,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (VerifyScheduling) { SchedCandidate TCand; TCand.reset(CandPolicy()); - pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); } @@ -327,7 +412,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); - 
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } @@ -337,7 +423,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); - pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 2084aae4128f..f0aea2bc4ab8 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -45,12 +45,12 @@ protected: void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand); + SchedCandidate &Cand, bool IsBottomUp); - void initCandidate(SchedCandidate &Cand, SUnit *SU, - bool AtTop, const RegPressureTracker &RPTracker, - const SIRegisterInfo *SRI, - unsigned SGPRPressure, unsigned VGPRPressure); + void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, + const RegPressureTracker &RPTracker, + const SIRegisterInfo *SRI, unsigned SGPRPressure, + unsigned VGPRPressure, bool IsBottomUp); std::vector<unsigned> Pressure; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index db5b467f2238..07ff855756ec 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -159,6 +159,10 @@ protected: bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; bool HasPkFmacF16Inst = false; + bool HasAtomicFMinFMaxF32GlobalInsts = false; + bool HasAtomicFMinFMaxF64GlobalInsts = false; + bool HasAtomicFMinFMaxF32FlatInsts = false; + bool 
HasAtomicFMinFMaxF64FlatInsts = false; bool HasAtomicDsPkAdd16Insts = false; bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; @@ -167,6 +171,7 @@ protected: bool HasAtomicBufferGlobalPkAddF16Insts = false; bool HasAtomicCSubNoRtnInsts = false; bool HasAtomicGlobalPkAddBF16Inst = false; + bool HasAtomicBufferPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; bool HasDefaultComponentZero = false; bool HasDefaultComponentBroadcast = false; @@ -820,6 +825,22 @@ public: return HasPkFmacF16Inst; } + bool hasAtomicFMinFMaxF32GlobalInsts() const { + return HasAtomicFMinFMaxF32GlobalInsts; + } + + bool hasAtomicFMinFMaxF64GlobalInsts() const { + return HasAtomicFMinFMaxF64GlobalInsts; + } + + bool hasAtomicFMinFMaxF32FlatInsts() const { + return HasAtomicFMinFMaxF32FlatInsts; + } + + bool hasAtomicFMinFMaxF64FlatInsts() const { + return HasAtomicFMinFMaxF64FlatInsts; + } + bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } @@ -844,6 +865,10 @@ public: return HasAtomicGlobalPkAddBF16Inst; } + bool hasAtomicBufferPkAddBF16Inst() const { + return HasAtomicBufferPkAddBF16Inst; + } + bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } @@ -1547,6 +1572,8 @@ public: bool hasFlatScratchInit() const { return FlatScratchInit; } + bool hasPrivateSegmentSize() const { return PrivateSegmentSize; } + unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } @@ -1611,6 +1638,8 @@ private: bool FlatScratchInit = false; + bool PrivateSegmentSize = false; + unsigned NumKernargPreloadSGPRs = 0; unsigned NumUsedUserSGPRs = 0; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 
883b6c4407fe..bb5de368810d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -43,7 +43,6 @@ void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const { void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - OS.flush(); printInstruction(MI, Address, STI, OS); printAnnotation(OS, Annot); } @@ -57,9 +56,15 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + Op.getExpr()->print(O, &MAI); + return; + } + // It's possible to end up with a 32-bit literal used with a 16-bit operand // with ignored high bits. Print as 32-bit anyway in that case. - int64_t Imm = MI->getOperand(OpNo).getImm(); + int64_t Imm = Op.getImm(); if (isInt<16>(Imm) || isUInt<16>(Imm)) O << formatHex(static_cast<uint64_t>(Imm & 0xffff)); else diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index fb93f45e3e87..b3cca91f6380 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -662,6 +662,11 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + int64_t Val; + if (MO.isExpr() && MO.getExpr()->evaluateAsAbsolute(Val)) { + Op = Val; + return; + } if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { // FIXME: If this is expression is PCRel or not should not depend on what diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 159664faf983..83fbf4ac53d5 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -21,13 +21,11 @@ using namespace llvm; using namespace llvm::AMDGPU; -AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind, - ArrayRef<const MCExpr *> Args, - MCContext &Ctx) +AMDGPUMCExpr::AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, + MCContext &Ctx) : Kind(Kind), Ctx(Ctx) { assert(Args.size() >= 1 && "Needs a minimum of one expression."); - assert(Kind != AGVK_None && - "Cannot construct AMDGPUVariadicMCExpr of kind none."); + assert(Kind != AGVK_None && "Cannot construct AMDGPUMCExpr of kind none."); // Allocating the variadic arguments through the same allocation mechanism // that the object itself is allocated with so they end up in the same memory. @@ -40,25 +38,23 @@ AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind, this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size()); } -AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); } +AMDGPUMCExpr::~AMDGPUMCExpr() { Ctx.deallocate(RawArgs); } -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx); +const AMDGPUMCExpr *AMDGPUMCExpr::create(VariantKind Kind, + ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return new (Ctx) AMDGPUMCExpr(Kind, Args, Ctx); } -const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const { - assert(Index < Args.size() && - "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr"); +const MCExpr *AMDGPUMCExpr::getSubExpr(size_t Index) const { + assert(Index < Args.size() && "Indexing out of bounds AMDGPUMCExpr sub-expr"); return Args[Index]; } -void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS, - const MCAsmInfo *MAI) const { +void AMDGPUMCExpr::printImpl(raw_ostream &OS, 
const MCAsmInfo *MAI) const { switch (Kind) { default: - llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind."); + llvm_unreachable("Unknown AMDGPUMCExpr kind."); case AGVK_Or: OS << "or("; break; @@ -86,21 +82,19 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS, OS << ')'; } -static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1, - int64_t Arg2) { +static int64_t op(AMDGPUMCExpr::VariantKind Kind, int64_t Arg1, int64_t Arg2) { switch (Kind) { default: - llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind."); - case AMDGPUVariadicMCExpr::AGVK_Max: + llvm_unreachable("Unknown AMDGPUMCExpr kind."); + case AMDGPUMCExpr::AGVK_Max: return std::max(Arg1, Arg2); - case AMDGPUVariadicMCExpr::AGVK_Or: + case AMDGPUMCExpr::AGVK_Or: return Arg1 | Arg2; } } -bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -112,7 +106,7 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, }; assert(Args.size() == 3 && - "AMDGPUVariadic Argument count incorrect for ExtraSGPRs"); + "AMDGPUMCExpr Argument count incorrect for ExtraSGPRs"); const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0; @@ -129,9 +123,8 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -142,7 +135,7 @@ 
bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, return true; }; assert(Args.size() == 2 && - "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs"); + "AMDGPUMCExpr Argument count incorrect for TotalNumVGPRs"); const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); uint64_t NumAGPR = 0, NumVGPR = 0; @@ -158,9 +151,8 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -172,7 +164,7 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, }; assert(Args.size() == 2 && - "AMDGPUVariadic Argument count incorrect for AlignTo"); + "AMDGPUMCExpr Argument count incorrect for AlignTo"); uint64_t Value = 0, Align = 0; if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align)) return false; @@ -181,9 +173,8 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -194,7 +185,7 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, return true; }; assert(Args.size() == 7 && - "AMDGPUVariadic Argument count incorrect for Occupancy"); + "AMDGPUMCExpr Argument count incorrect for Occupancy"); uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation, NumSGPRs, NumVGPRs; @@ -226,8 +217,9 @@ bool 
AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl( - MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { std::optional<int64_t> Total; switch (Kind) { @@ -258,12 +250,12 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl( return true; } -void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const { +void AMDGPUMCExpr::visitUsedExpr(MCStreamer &Streamer) const { for (const MCExpr *Arg : Args) Streamer.visitUsedExpr(*Arg); } -MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const { +MCFragment *AMDGPUMCExpr::findAssociatedFragment() const { for (const MCExpr *Arg : Args) { if (Arg->findAssociatedFragment()) return Arg->findAssociatedFragment(); @@ -275,18 +267,19 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const { /// are unresolvable but needed for further MCExprs). Derived from /// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp. 
/// -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed, - const MCExpr *FlatScrUsed, - bool XNACKUsed, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, + MCContext &Ctx) { return create(AGVK_ExtraSGPRs, {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)}, Ctx); } -const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR( - const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, + const MCExpr *NumVGPR, + MCContext &Ctx) { return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } @@ -295,10 +288,11 @@ const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR( /// Remove dependency on GCNSubtarget and depend only only the necessary values /// for said occupancy computation. Should match computeOccupancy implementation /// without passing \p STM on. -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, - const GCNSubtarget &STM, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc, + const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, + MCContext &Ctx) { unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM); unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index f92350b59235..207a619d45a1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -17,7 +17,7 @@ namespace llvm { class Function; class GCNSubtarget; -/// AMDGPU target specific variadic MCExpr operations. +/// AMDGPU target specific MCExpr operations. 
/// /// Takes in a minimum of 1 argument to be used with an operation. The supported /// operations are: @@ -27,9 +27,9 @@ class GCNSubtarget; /// \note If the 'or'/'max' operations are provided only a single argument, the /// operation will act as a no-op and simply resolve as the provided argument. /// -class AMDGPUVariadicMCExpr : public MCTargetExpr { +class AMDGPUMCExpr : public MCTargetExpr { public: - enum VariadicKind { + enum VariantKind { AGVK_None, AGVK_Or, AGVK_Max, @@ -40,14 +40,13 @@ public: }; private: - VariadicKind Kind; + VariantKind Kind; MCContext &Ctx; const MCExpr **RawArgs; ArrayRef<const MCExpr *> Args; - AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args, - MCContext &Ctx); - ~AMDGPUVariadicMCExpr(); + AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); + ~AMDGPUMCExpr(); bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const; @@ -59,40 +58,39 @@ private: const MCFixup *Fixup) const; public: - static const AMDGPUVariadicMCExpr * - create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); + static const AMDGPUMCExpr * + create(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return create(VariadicKind::AGVK_Or, Args, Ctx); + static const AMDGPUMCExpr *createOr(ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return create(VariantKind::AGVK_Or, Args, Ctx); } - static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return create(VariadicKind::AGVK_Max, Args, Ctx); + static const AMDGPUMCExpr *createMax(ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return create(VariantKind::AGVK_Max, Args, Ctx); } - static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed, - const MCExpr *FlatScrUsed, - bool XNACKUsed, - MCContext &Ctx); + static const AMDGPUMCExpr 
*createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, MCContext &Ctx); - static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR, - const MCExpr *NumVGPR, - MCContext &Ctx); + static const AMDGPUMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR, + const MCExpr *NumVGPR, + MCContext &Ctx); - static const AMDGPUVariadicMCExpr * + static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) { - return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx); + return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc, - const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, - const GCNSubtarget &STM, - MCContext &Ctx); + static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, + const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, + MCContext &Ctx); - VariadicKind getKind() const { return Kind; } + VariantKind getKind() const { return Kind; } const MCExpr *getSubExpr(size_t Index) const; void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index e805e964ffe4..531031b58034 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -319,8 +319,9 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) { + const MCKernelDescriptor &KD, const MCExpr *NextVGPR, + const MCExpr *NextSGPR, const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); const MCAsmInfo *MAI = 
getContext().getAsmInfo(); @@ -339,16 +340,25 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << '\n'; }; + auto EmitMCExpr = [&](const MCExpr *Value) { + int64_t evaluatableValue; + if (Value->evaluateAsAbsolute(evaluatableValue)) { + OS << static_cast<uint64_t>(evaluatableValue); + } else { + Value->print(OS, MAI); + } + }; + OS << "\t\t.amdhsa_group_segment_fixed_size "; - KD.group_segment_fixed_size->print(OS, MAI); + EmitMCExpr(KD.group_segment_fixed_size); OS << '\n'; OS << "\t\t.amdhsa_private_segment_fixed_size "; - KD.private_segment_fixed_size->print(OS, MAI); + EmitMCExpr(KD.private_segment_fixed_size); OS << '\n'; OS << "\t\t.amdhsa_kernarg_size "; - KD.kernarg_size->print(OS, MAI); + EmitMCExpr(KD.kernarg_size); OS << '\n'; PrintField( @@ -433,8 +443,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( ".amdhsa_system_vgpr_workitem_id"); // These directives are required. - OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; - OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + OS << "\t\t.amdhsa_next_free_vgpr "; + EmitMCExpr(NextVGPR); + OS << '\n'; + + OS << "\t\t.amdhsa_next_free_sgpr "; + EmitMCExpr(NextSGPR); + OS << '\n'; if (AMDGPU::isGFX90A(STI)) { // MCExpr equivalent of taking the (accum_offset + 1) * 4. 
@@ -447,19 +462,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( accum_bits = MCBinaryExpr::createMul( accum_bits, MCConstantExpr::create(4, getContext()), getContext()); OS << "\t\t.amdhsa_accum_offset "; - int64_t IVal; - if (accum_bits->evaluateAsAbsolute(IVal)) { - OS << static_cast<uint64_t>(IVal); - } else { - accum_bits->print(OS, MAI); - } + EmitMCExpr(accum_bits); OS << '\n'; } - if (!ReserveVCC) - OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; - if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI)) - OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; + OS << "\t\t.amdhsa_reserve_vcc "; + EmitMCExpr(ReserveVCC); + OS << '\n'; + + if (IVersion.Major >= 7 && !hasArchitectedFlatScratch(STI)) { + OS << "\t\t.amdhsa_reserve_flat_scratch "; + EmitMCExpr(ReserveFlatScr); + OS << '\n'; + } switch (CodeObjectVersion) { default: @@ -915,8 +930,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { + const MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, + const MCExpr *NextSGPR, const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index e5c90060cb5d..bf1538c71d15 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -94,8 +94,9 @@ public: virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool 
ReserveFlatScr) {} + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) {} static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -151,8 +152,9 @@ public: void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) override; + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -207,8 +209,9 @@ public: void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) override; + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) override; }; } #endif diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp index 22d0594e2b86..56a23e26b8d9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp @@ -21,7 +21,6 @@ using namespace llvm; void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &O) { - O.flush(); printInstruction(MI, Address, O); printAnnotation(O, Annot); } diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 0a96c643d9bd..1a73fdf028c9 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -113,8 +113,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -140,9 +140,9 @@ public: FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis<MachineDominatorTree>(); - LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); - PDT = &getAnalysis<MachinePostDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + LLVM_DEBUG(MDT->print(dbgs());); + PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); LLVM_DEBUG(PDT->print(dbgs());); prepare(); run(); @@ -1629,8 +1629,8 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 77935cb4cde1..8bac570d59d4 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -103,8 +103,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); 
AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 59e274787590..64185db02ec1 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -35,8 +35,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index a00ca625fc73..68c5f23c8e11 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -162,8 +162,8 @@ public: StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -173,7 +173,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); - + MDT = 
&getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 5c411a095587..7bf6a635158e 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1519,6 +1519,9 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::V_PK_MAX_F16: { + if (MI.mayRaiseFPException()) + return nullptr; + if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) return nullptr; @@ -1565,6 +1568,9 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (TII->getClampMask(*Def) != TII->getClampMask(MI)) return false; + if (Def->mayRaiseFPException()) + return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); if (!DefClamp) return false; @@ -1650,7 +1656,9 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 || Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 || Op == AMDGPU::V_MUL_F16_fake16_e64) && - MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) + MFI->getMode().FP64FP16Denormals.Output != + DenormalMode::PreserveSign) || + MI.mayRaiseFPException()) return std::pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1725,6 +1733,9 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) return false; + if (Def->mayRaiseFPException()) + return false; + // Clamp is applied after omod. If the source already has clamp set, don't // fold it. 
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4d8667affdb4..83bfb622ee52 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) // Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, - ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, - ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, + ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, + ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, ISD::SSUBSAT}, VT, Custom); @@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8}, + MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, + MVT::i8}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, - {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, - MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, - MVT::i16, MVT::i8, MVT::i128}, + {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16, + MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, + MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16, + MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_VOID, - {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, - MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, - MVT::i8, MVT::i128}, + {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, + MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, + MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, + MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); @@ -942,6 +945,8 @@ SITargetLowering::SITargetLowering(const 
TargetMachine &TM, ISD::ATOMIC_LOAD_UMIN, ISD::ATOMIC_LOAD_UMAX, ISD::ATOMIC_LOAD_FADD, + ISD::ATOMIC_LOAD_FMIN, + ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, ISD::INTRINSIC_VOID, @@ -1109,29 +1114,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, + const DataLayout &DL, Type *Ty, + unsigned MaxNumLanes) { assert(MaxNumLanes != 0); + LLVMContext &Ctx = Ty->getContext(); if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); - return EVT::getVectorVT(Ty->getContext(), - EVT::getEVT(VT->getElementType()), + return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()), NumElts); } - return EVT::getEVT(Ty); + return TLI.getValueType(DL, Ty); } // Peek through TFE struct returns to only use the data size. -static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, + const DataLayout &DL, Type *Ty, + unsigned MaxNumLanes) { auto *ST = dyn_cast<StructType>(Ty); if (!ST) - return memVTFromLoadIntrData(Ty, MaxNumLanes); + return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes); // TFE intrinsics return an aggregate type. 
assert(ST->getNumContainedTypes() == 2 && ST->getContainedType(1)->isIntegerTy(32)); - return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes); + return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes); } /// Map address space 7 to MVT::v5i32 because that's its in-memory @@ -1200,9 +1209,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOVolatile; Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { - unsigned MaxNumLanes = 4; - if (RsrcIntr->IsImage) { + unsigned MaxNumLanes = 4; + const AMDGPU::ImageDimIntrinsicInfo *Intr = AMDGPU::getImageDimIntrinsicInfo(IntrID); const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = @@ -1215,9 +1224,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask); } - } - Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes); + Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(), + CI.getType(), MaxNumLanes); + } else { + Info.memVT = + memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), + std::numeric_limits<unsigned>::max()); + } // FIXME: What does alignment mean for an image? Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1229,9 +1243,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (RsrcIntr->IsImage) { unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); unsigned DMaskLanes = DMask == 0 ? 
1 : llvm::popcount(DMask); - Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes); + Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy, + DMaskLanes); } else - Info.memVT = EVT::getEVT(DataTy); + Info.memVT = getValueType(MF.getDataLayout(), DataTy); Info.flags |= MachineMemOperand::MOStore; } else { @@ -1265,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, switch (IntrID) { case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1280,19 +1294,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_buffer_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); - if (!Vol || !Vol->isZero()) - Info.flags |= MachineMemOperand::MOVolatile; - - return true; - } case Intrinsic::amdgcn_ds_add_gs_reg_rtn: case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1449,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_ordered_add: @@ -1610,6 +1610,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } + if ((AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + AM.BaseOffs < 0) { + // Scalar (non-buffer) loads can only use a negative offset if + // soffset+offset is non-negative. 
Since the compiler can only prove that + // in a few special cases, it is safer to claim that negative offsets are + // not supported. + return false; + } + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; @@ -2468,6 +2478,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(FlatScratchInitReg); } + if (UserSGPRInfo.hasPrivateSegmentSize()) { + Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI); + MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentSizeReg); + } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. } @@ -5811,6 +5827,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: return lowerDEBUGTRAP(Op, DAG); + case ISD::ABS: case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: @@ -6097,6 +6114,184 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IID = N->getConstantOperandVal(0); + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, + SDValue Src2, MVT ValT) -> SDValue { + SmallVector<SDValue, 8> Operands; + switch (IID) { + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + Operands.push_back(N->getOperand(6)); + Operands.push_back(N->getOperand(5)); + Operands.push_back(N->getOperand(4)); + [[fallthrough]]; + case Intrinsic::amdgcn_writelane: + Operands.push_back(Src2); + [[fallthrough]]; + case Intrinsic::amdgcn_readlane: + 
Operands.push_back(Src1); + [[fallthrough]]; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: + Operands.push_back(Src0); + break; + default: + llvm_unreachable("unhandled lane op"); + } + + Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); + std::reverse(Operands.begin(), Operands.end()); + + if (SDNode *GL = N->getGluedNode()) { + assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + GL = GL->getOperand(0).getNode(); + Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL, 0))); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); + }; + + SDValue Src0 = N->getOperand(1); + SDValue Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { + Src1 = N->getOperand(2); + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { + // Already legal + return SDValue(); + } + + if (ValSize < 32) { + bool IsFloat = VT.isFloatingPoint(); + Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, + SL, MVT::i32); + + if (IsPermLane16) { + Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, + SL, MVT::i32); + } + + if (IID == Intrinsic::amdgcn_writelane) { + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); + } + + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); + SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); + return IsFloat ? 
DAG.getBitcast(VT, Trunc) : Trunc; + } + + if (ValSize % 32 != 0) + return SDValue(); + + auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { + EVT VT = N->getValueType(0); + unsigned NE = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + SmallVector<SDValue, 8> Scalars; + unsigned NumOperands = N->getNumOperands(); + SmallVector<SDValue, 4> Operands(NumOperands); + SDNode *GL = N->getGluedNode(); + + // only handle convergencectrl_glue + assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + + for (unsigned i = 0; i != NE; ++i) { + for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; + ++j) { + SDValue Operand = N->getOperand(j); + EVT OperandVT = Operand.getValueType(); + if (OperandVT.isVector()) { + // A vector operand; extract a single element. + EVT OperandEltVT = OperandVT.getVectorElementType(); + Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, + Operand, DAG.getVectorIdxConstant(i, SL)); + } else { + // A scalar operand; just use it as is. 
+ Operands[j] = Operand; + } + } + + if (GL) + Operands[NumOperands - 1] = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL->getOperand(0).getNode(), 0)); + + Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); + } + + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); + return DAG.getBuildVector(VecVT, SL, Scalars); + }; + + if (VT.isVector()) { + switch (MVT::SimpleValueType EltTy = + VT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i32: + case MVT::f32: { + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); + return unrollLaneOp(LaneOp.getNode()); + } + case MVT::i16: + case MVT::f16: + case MVT::bf16: { + MVT SubVecVT = MVT::getVectorVT(EltTy, 2); + SmallVector<SDValue, 4> Pieces; + SDValue Src0SubVec, Src1SubVec, Src2SubVec; + for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) { + Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IsPermLane16) + Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IID == Intrinsic::amdgcn_writelane) + Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + Pieces.push_back( + IsPermLane16 + ? 
createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) + : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + EltIdx += 2; + } + return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); + } + default: + // Handle all other cases by bitcasting to i32 vectors + break; + } + } + + MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); + Src0 = DAG.getBitcast(VecVT, Src0); + + if (IsPermLane16) + Src1 = DAG.getBitcast(VecVT, Src1); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = DAG.getBitcast(VecVT, Src2); + + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); + SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); + return DAG.getBitcast(VT, UnrolledLaneOp); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -8563,6 +8758,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_addrspacecast_nonnull: return lowerADDRSPACECAST(Op, DAG); + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: + return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -8609,12 +8811,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, M->getMemOperand()); } -// Return a value to use for the idxen operand by examining the vindex operand. -static unsigned getIdxEn(SDValue VIndex) { - // No need to set idxen if vindex is known to be zero. - return isNullConstant(VIndex) ? 
0 : 1; -} - SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -8703,78 +8899,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_ds_fadd: { - MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc; - switch (IntrID) { - case Intrinsic::amdgcn_ds_fadd: - Opc = ISD::ATOMIC_LOAD_FADD; - break; - } - - return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), - M->getOperand(0), M->getOperand(2), M->getOperand(3), - M->getMemOperand()); - } case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc; - switch (IntrID) { - case Intrinsic::amdgcn_ds_fmin: - Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; - break; - case Intrinsic::amdgcn_ds_fmax: - Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; - break; - default: - llvm_unreachable("Unknown intrinsic!"); - } - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: { - unsigned Glc = Op.getConstantOperandVal(5); - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - - unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); - auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, - M->getMemOperand()); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN + : ISD::ATOMIC_LOAD_FMAX; + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0), + M->getOperand(2), M->getOperand(3), + M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: @@ -8825,35 +8957,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } - case Intrinsic::amdgcn_tbuffer_load: { - MemSDNode *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); - unsigned Dfmt = Op.getConstantOperandVal(7); - unsigned Nfmt = Op.getConstantOperandVal(8); - unsigned Glc = Op.getConstantOperandVal(9); - unsigned Slc = Op.getConstantOperandVal(10); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - SOffset, // soffset - Op.getOperand(6), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen - }; - - if (LoadVT.getScalarType() == MVT::f16) - return 
adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, M->getMemOperand(), - DAG); - } case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -8908,94 +9011,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, LoadVT, M->getMemOperand(), DAG); } - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_csub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - EVT VT = Op.getValueType(); - - auto *M = cast<MemSDNode>(Op); - unsigned Opcode = 0; - - switch (IntrID) { - case Intrinsic::amdgcn_buffer_atomic_swap: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; - break; - case Intrinsic::amdgcn_buffer_atomic_add: - Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; - break; - case Intrinsic::amdgcn_buffer_atomic_sub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; - break; - case 
Intrinsic::amdgcn_buffer_atomic_csub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; - break; - case Intrinsic::amdgcn_buffer_atomic_smin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_umin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_smax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_umax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_and: - Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; - break; - case Intrinsic::amdgcn_buffer_atomic_or: - Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; - break; - case Intrinsic::amdgcn_buffer_atomic_xor: - Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; - break; - case Intrinsic::amdgcn_buffer_atomic_fadd: - Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; - break; - default: - llvm_unreachable("unhandled atomic opcode"); - } - - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - return lowerRawBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - return lowerStructBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); @@ -9092,29 +9113,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - 
case Intrinsic::amdgcn_buffer_atomic_cmpswap: { - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(5)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - - EVT VT = Op.getValueType(); - auto *M = cast<MemSDNode>(Op); - - return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -9313,22 +9311,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + Opcode = ISD::ATOMIC_LOAD_FMIN; break; } case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + Opcode = ISD::ATOMIC_LOAD_FMAX; break; } default: llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), - M->getVTList(), Ops, M->getMemoryVT(), - M->getMemOperand()); + return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), + Ops, M->getMemOperand()); } case Intrinsic::amdgcn_s_get_barrier_state: { SDValue Chain = Op->getOperand(0); @@ -9557,34 +9554,6 @@ SDValue 
SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(); }; - case Intrinsic::amdgcn_tbuffer_store: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Dfmt = Op.getConstantOperandVal(8); - unsigned Nfmt = Op.getConstantOperandVal(9); - unsigned Glc = Op.getConstantOperandVal(10); - unsigned Slc = Op.getConstantOperandVal(11); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // voffset - Op.getOperand(6), // soffset - Op.getOperand(7), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : - AMDGPUISD::TBUFFER_STORE_FORMAT; - MemSDNode *M = cast<MemSDNode>(Op); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { @@ -9642,42 +9611,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_buffer_store: - case Intrinsic::amdgcn_buffer_store_format: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Glc = Op.getConstantOperandVal(6); - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by 
setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; - Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; - MemSDNode *M = cast<MemSDNode>(Op); - - // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } - case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: @@ -10083,8 +10016,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( return {N0, SDValue(C1, 0)}; } -// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the -// three offsets (voffset, soffset and instoffset) into the SDValue[3] array +// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store +// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. 
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, @@ -10215,7 +10148,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, SDLoc DL, SDValue Ops[], MemSDNode *M) const { - if (VDataType == MVT::f16) + if (VDataType == MVT::f16 || VDataType == MVT::bf16) Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); @@ -16063,8 +15996,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); case AMDGPUISD::ATOMIC_CMP_SWAP: - case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: case AMDGPUISD::BUFFER_ATOMIC_SWAP: case AMDGPUISD::BUFFER_ATOMIC_ADD: case AMDGPUISD::BUFFER_ATOMIC_SUB: @@ -16080,7 +16011,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: - case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: // Target-specific read-modify-write atomics are sources of divergence. 
@@ -16173,6 +16103,26 @@ static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { << " operation at memory scope " << MemScope; } +static bool isHalf2OrBFloat2(Type *Ty) { + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + Type *EltTy = VT->getElementType(); + return VT->getNumElements() == 2 && + (EltTy->isHalfTy() || EltTy->isBFloatTy()); + } + + return false; +} + +static bool isHalf2(Type *Ty) { + FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); +} + +static bool isBFloat2(Type *Ty) { + FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16231,7 +16181,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { : AtomicExpansionKind::CmpXChg; } - // TODO: Handle v2f16/v2bf16 cases for gfx940 + if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; } @@ -16239,10 +16191,36 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { AS != AMDGPUAS::BUFFER_FAT_POINTER) return AtomicExpansionKind::CmpXChg; - // TODO: gfx940 supports v2f16 and v2bf16 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) return AtomicExpansionKind::None; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; 
+ + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for + // buffer. gfx12 does have the buffer version. + if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } + if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; @@ -16284,17 +16262,51 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } case AtomicRMWInst::FMin: - case AtomicRMWInst::FMax: + case AtomicRMWInst::FMax: { + Type *Ty = RMW->getType(); + + // LDS float and double fmin/fmax were always supported. + if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) + return AtomicExpansionKind::None; + + if (unsafeFPAtomicsDisabled(RMW->getFunction())) + return AtomicExpansionKind::CmpXChg; + + // Always expand system scope fp atomics. + if (HasSystemScope) + return AtomicExpansionKind::CmpXChg; + + // For flat and global cases: + // float, double in gfx7. Manual claims denormal support. + // Removed in gfx8. + // float, double restored in gfx10. + // double removed again in gfx11, so only f32 for gfx11/gfx12. + // + // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no + // f32. 
+ // + // FIXME: Check scope and fine grained memory + if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } + + return AtomicExpansionKind::CmpXChg; + } case AtomicRMWInst::Min: case AtomicRMWInst::Max: case AtomicRMWInst::UMin: case AtomicRMWInst::UMax: { if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (RMW->getType()->isFloatTy() && - unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - // Always expand system scope min/max atomics. if (HasSystemScope) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4c02bb1b306e..1f198a92c0fa 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -253,9 +253,9 @@ public: bool shouldExpandVectorDynExt(SDNode *N) const; private: - // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the - // three offsets (voffset, soffset and instoffset) into the SDValue[3] array - // pointed to by Offsets. + // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store + // the three offsets (voffset, soffset and instoffset) into the SDValue[3] + // array pointed to by Offsets. 
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment = Align(4)) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 230443313d72..4c53a081cdb2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -641,7 +641,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineLoopInfo>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addUsedIfAvailable<AAResultsWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -1118,7 +1118,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) @@ -2398,7 +2398,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MLI = &getAnalysis<MachineLoopInfo>(); - PDT = &getAnalysis<MachinePostDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) AA = &AAR->getAAResults(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d8e21da8019a..cc1b9ac0c9ec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case AMDGPU::ENTER_PSEUDO_WM: - case AMDGPU::EXIT_PSEUDO_WM: { - // These do nothing. - MI.eraseFromParent(); - break; - } case AMDGPU::SI_RETURN: { const MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -3978,7 +3972,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .add(*Src1) - .addImm(Imm); + .addImm(Imm) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -3997,7 +3992,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4018,7 +4014,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src1) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4054,7 +4051,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Src2Mods ? Src2Mods->getImm() : 0) .add(*Src2) .addImm(Clamp ? Clamp->getImm() : 0) - .addImm(Omod ? Omod->getImm() : 0); + .addImm(Omod ? Omod->getImm() : 0) + .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? 
OpSel->getImm() : 0); updateLiveVariables(LV, MI, *MIB); @@ -5657,24 +5655,9 @@ unsigned SIInstrInfo::buildExtractSubReg( DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); - if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(SuperReg.getReg(), 0, SubIdx); - return SubReg; - } - - // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to worry about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to - // eliminate this extra copy. - Register NewSuperReg = MRI.createVirtualRegister(SuperRC); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) - .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); - + unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(NewSuperReg, 0, SubIdx); - + .addReg(SuperReg.getReg(), 0, NewSubIdx); return SubReg; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 40289f2addfd..c64b3a7c356f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; -def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - // load_d16_{lo|hi} ptr, tied_input def SIload_d16 : SDTypeProfile<1, 2, [ SDTCisPtrTy<1>, @@ -222,7 +214,6 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; defm SIbuffer_atomic_csub : SDBufferAtomic 
<"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; -defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">; @@ -315,13 +306,6 @@ class isIntType<ValueType SrcVT> { } //===----------------------------------------------------------------------===// -// PatFrags for global memory operations -//===----------------------------------------------------------------------===// - -defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>; -defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>; - -//===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. // This is for SDNodes and PatFrag for local loads and stores to // enable s_mov_b32 m0, -1 to be glued to the memory instructions. 
@@ -709,15 +693,24 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, >; let AddressSpaces = StoreAddress_local.AddrSpaces in { - defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), - IsInt>; + + if IsInt then { + defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + } else { + defm _local_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + defm _local_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + } } let AddressSpaces = StoreAddress_region.AddrSpaces in { - defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), - IsInt>; + if IsInt then { + defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + } else { + defm _region_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + defm _region_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + } } } @@ -734,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>; def as_i1timm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -2233,13 +2226,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, // Return an AGPR+VGPR operand class for the given VGPR 
register class. class getLdStRegisterOperand<RegisterClass RC> { RegisterOperand ret = - !if(!eq(RC.Size, 32), AVLdSt_32, - !if(!eq(RC.Size, 64), AVLdSt_64, - !if(!eq(RC.Size, 96), AVLdSt_96, - !if(!eq(RC.Size, 128), AVLdSt_128, - !if(!eq(RC.Size, 160), AVLdSt_160, - RegisterOperand<VReg_1> // invalid register - ))))); + !cond(!eq(RC.Size, 32) : AVLdSt_32, + !eq(RC.Size, 64) : AVLdSt_64, + !eq(RC.Size, 96) : AVLdSt_96, + !eq(RC.Size, 128) : AVLdSt_128, + !eq(RC.Size, 160) : AVLdSt_160, + !eq(RC.Size, 1024) : AVLdSt_1024); } class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, @@ -2271,6 +2263,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; field bit IsRealTrue16 = 0; + field bit IsInvalidSingleUseConsumer = 0; + field bit IsInvalidSingleUseProducer = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c1b844f844c3..835f44f9d0d6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>; } // End usesCustomInserter = 1 -// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes. -def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { - let Uses = [EXEC]; - let Defs = [EXEC]; - let hasSideEffects = 0; - let mayLoad = 0; - let mayStore = 0; -} - -def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { - let hasSideEffects = 0; - let mayLoad = 0; - let mayStore = 0; -} - // Pseudo instructions used for @llvm.fptrunc.round upward // and @llvm.fptrunc.round downward. 
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD @@ -252,16 +237,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), // restoring it after we're done. let Defs = [SCC], isConvergent = 1 in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VSrc_b32: $src, VSrc_b32:$inactive), - [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { -} + (ins VSrc_b32: $src, VSrc_b32:$inactive), []>; def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VSrc_b64: $src, VSrc_b64:$inactive), - [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { -} + (ins VSrc_b64: $src, VSrc_b64:$inactive), []>; } // End Defs = [SCC] +foreach vt = Reg32Types.types in { +def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), + (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>; +} + +foreach vt = Reg64Types.types in { +def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), + (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>; +} + def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>; @@ -3398,7 +3389,7 @@ def : GCNPat< // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. 
def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; @@ -3872,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { let mayStore = 1; } -let Namespace = "AMDGPU" in { -def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; -def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; -} - class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, @@ -3901,7 +3887,6 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; -def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index abb72e8e63c3..afc6353ec811 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -48,8 +48,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -60,7 +60,7 @@ char SILateBranchLowering::ID = 0; INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", 
false, false) @@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 5dc3457b5bfa..75a1575f2180 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -149,7 +149,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addUsedIfAvailable<LiveIntervals>(); // Should preserve the same set that TwoAddressInstructions does. - AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); AU.addPreservedID(LiveVariablesID); @@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable<LiveIntervals>(); // This doesn't actually need LiveVariables, but we can preserve them. LV = getAnalysisIfAvailable<LiveVariables>(); - MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 32dad0c425c0..a9ee74dec120 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -51,8 +51,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -399,8 +399,8 @@ private: INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) @@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { MachineFunctionProperties::Property::Selected)) return false; - Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(), - &getAnalysis<MachinePostDominatorTree>()); + Vreg1LoweringHelper Helper( + &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(), + &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree()); bool Changed = false; Changed |= Helper.lowerCopiesFromI1(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 072c5aedc220..d9db0f7a4f53 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -83,7 +83,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (CC != CallingConv::AMDGPU_Gfx) 
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; - // TODO: Pick a high register, and shift down, similar to a kernel. FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; @@ -233,6 +232,12 @@ Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return ArgInfo.FlatScratchInit.getRegister(); } +Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) { + ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR()); + NumUserSGPRs += 1; + return ArgInfo.PrivateSegmentSize.getRegister(); +} + Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fe02e24c8a1..7af5e7388f84 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -752,6 +752,7 @@ public: Register addKernargSegmentPtr(const SIRegisterInfo &TRI); Register addDispatchID(const SIRegisterInfo &TRI); Register addFlatScratchInit(const SIRegisterInfo &TRI); + Register addPrivateSegmentSize(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); Register addLDSKernelId(); SmallVectorImpl<MCRegister> * diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 8204a70e72d9..18d66e419152 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -148,10 +148,10 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveVariables>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<LiveVariables>(); - 
AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -618,7 +618,7 @@ char SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, "SI Optimize VGPR LiveRange", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); Loops = &getAnalysis<MachineLoopInfo>(); LV = &getAnalysis<LiveVariables>(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 1fadd8ce45b1..f47731bf6aac 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed, namespace { +bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, + const SIInstrInfo *TII); class SDWAOperand; class SDWADstOperand; -class SIPeepholeSDWA : public MachineFunctionPass { -public: - using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; +using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; +using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>; +class SIPeepholeSDWA : public MachineFunctionPass { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const SIInstrInfo *TII; MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; - MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; + 
SDWAOperandsMap PotentialMatches; SmallVector<MachineInstr *, 8> ConvertedInstructions; std::optional<int64_t> foldToImm(const MachineOperand &Op) const; @@ -65,7 +67,6 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); @@ -93,7 +94,9 @@ public: virtual ~SDWAOperand() = default; - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) = 0; virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; MachineOperand *getTargetOperand() const { return Target; } @@ -126,7 +129,9 @@ public: : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } @@ -153,7 +158,9 @@ public: SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } @@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 
return Mods; } -MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { + if (PotentialMatches != nullptr) { + // Fill out the map for all uses if all can be converted + MachineOperand *Reg = getReplacedOperand(); + if (!Reg->isReg() || !Reg->isDef()) + return nullptr; + + for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) + // Check that all instructions that use Reg can be converted + if (!isConvertibleToSDWA(UseMI, ST, TII)) + return nullptr; + + // Now that it's guaranteed all uses are legal, iterate over the uses again + // to add them for later conversion. + for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + // Should not get a subregister here + assert(isSameReg(UseMO, *Reg)); + + SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; + MachineInstr *UseMI = UseMO.getParent(); + potentialMatchesMap[UseMI].push_back(this); + } + return nullptr; + } + // For SDWA src operand potential instruction is one that use register // defined by parent instruction MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); @@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } -MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { // For SDWA dst operand potential instruction is one that defines register // that this operand uses MachineRegisterInfo *MRI = getMRI(); @@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } -bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, - const GCNSubtarget &ST) const { +namespace { +bool 
isConvertibleToSDWA(MachineInstr &MI, + const GCNSubtarget &ST, + const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return true; } +} // namespace bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { @@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); if (PotentialMI && (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) @@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 398f870a9f53..5837dbeb3f98 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM || - Opc == AMDGPU::ENTER_PSEUDO_WM) { + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) { dbgs() << "Entering "; } else { - assert(Opc == 
AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM || - Opc == AMDGPU::EXIT_PSEUDO_WM); + assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM); dbgs() << "Exiting "; } if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) { dbgs() << "Strict WWM "; - } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) { - dbgs() << "Pseudo WWM/WQM "; } else { assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM); dbgs() << "Strict WQM "; @@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { } if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM || - MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM || - MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) { + MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) { LLVM_DEBUG(printWWMInfo(MI)); InWWM = true; continue; } if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM || - MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM || - MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) { + MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) { LLVM_DEBUG(printWWMInfo(MI)); InWWM = false; } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 0d40816cdd4b..212edff09783 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -161,45 +161,6 @@ static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, return Val; } -uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const { - int64_t VBlocks, SBlocks; - VGPRBlocks->evaluateAsAbsolute(VBlocks); - SGPRBlocks->evaluateAsAbsolute(SBlocks); - - uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) | - S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) | - getComputePGMRSrc1Reg(*this, ST); - - return Reg; -} - -uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, - const GCNSubtarget &ST) const { - if (AMDGPU::isCompute(CC)) { - return getComputePGMRSrc1(ST); - } - int64_t VBlocks, SBlocks; - 
VGPRBlocks->evaluateAsAbsolute(VBlocks); - SGPRBlocks->evaluateAsAbsolute(SBlocks); - - return getPGMRSrc1Reg(*this, CC, ST) | - S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) | - S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)); -} - -uint64_t SIProgramInfo::getComputePGMRSrc2() const { - int64_t ScratchEn; - ScratchEnable->evaluateAsAbsolute(ScratchEn); - return ScratchEn | getComputePGMRSrc2Reg(*this); -} - -uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const { - if (AMDGPU::isCompute(CC)) - return getComputePGMRSrc2(); - - return 0; -} - const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const { uint64_t Reg = getComputePGMRSrc1Reg(*this, ST); diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index e66e5a194c8b..c358a2d9db10 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -98,16 +98,12 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { void reset(const MachineFunction &MF); /// Compute the value of the ComputePGMRsrc1 register. - uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const; - uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const; const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const; const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const; /// Compute the value of the ComputePGMRsrc2 register. 
- uint64_t getComputePGMRSrc2() const; - uint64_t getPGMRSrc2(CallingConv::ID CC) const; const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const; const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4b5f9bdd82b8..4c5e60c873bb 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3157,7 +3157,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const { - auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); + auto &MDT = LIS->getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index caac7126068e..f1d9aec16363 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> { def Reg16Types : RegisterTypes<[i16, f16, bf16]>; def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; -def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>; +def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>; +def Reg96Types : RegisterTypes<[v3i32, v3f32]>; +def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>; let HasVGPR = 1 in { // VOP3 and VINTERP can access 256 lo and 256 hi registers. 
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, let BaseClassOrder = 10000; } -def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v let HasSGPR = 1; } -def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 1; @@ -905,8 +907,8 @@ multiclass SRegClass<int numRegs, } } -defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; @@ -958,8 +960,8 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4], (add VGPR_64)>; -defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>; +defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>; +defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; @@ -1342,6 +1344,7 @@ def AVLdSt_64 : AVLdStOperand<AV_64, "OPW64">; def 
AVLdSt_96 : AVLdStOperand<AV_96, "OPW96">; def AVLdSt_128 : AVLdStOperand<AV_128, "OPW128">; def AVLdSt_160 : AVLdStOperand<AV_160, "OPW160">; +def AVLdSt_1024 : AVLdStOperand<AV_1024, "OPW1024">; //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 647fae904d39..79bcf5e8cd30 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -45,7 +45,6 @@ public: bool isKImmOperand(const MachineOperand &Src) const; bool isKUImmOperand(const MachineOperand &Src) const; bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; - bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; void shrinkScalarCompare(MachineInstr &MI) const; void shrinkMIMG(MachineInstr &MI) const; @@ -183,15 +182,36 @@ bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, return false; } -/// \returns true if the constant in \p Src should be replaced with a bitreverse -/// of an inline immediate. -bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src, - int32_t &ReverseImm) const { - if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) - return false; +/// \returns the opcode of an instruction a move immediate of the constant \p +/// Src can be replaced with if the constant is replaced with \p ModifiedImm. +/// i.e. +/// +/// If the bitreverse of a constant is an inline immediate, reverse the +/// immediate and return the bitreverse opcode. +/// +/// If the bitwise negation of a constant is an inline immediate, reverse the +/// immediate and return the bitwise not opcode. 
+static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII, + const MachineOperand &Src, + int32_t &ModifiedImm, bool Scalar) { + if (TII->isInlineConstant(Src)) + return 0; + int32_t SrcImm = static_cast<int32_t>(Src.getImm()); + + if (!Scalar) { + // We could handle the scalar case with here, but we would need to check + // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth + // it, as the reasonable values are already covered by s_movk_i32. + ModifiedImm = ~SrcImm; + if (TII->isInlineConstant(APInt(32, ModifiedImm))) + return AMDGPU::V_NOT_B32_e32; + } + + ModifiedImm = reverseBits<int32_t>(SrcImm); + if (TII->isInlineConstant(APInt(32, ModifiedImm))) + return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32; - ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); - return ReverseImm >= -16 && ReverseImm <= 64; + return 0; } /// Copy implicit register operands from specified instruction to this @@ -801,10 +821,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // XXX - not exactly a check for post-regalloc run. 
MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { - int32_t ReverseImm; - if (isReverseInlineImm(Src, ReverseImm)) { - MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); - Src.setImm(ReverseImm); + int32_t ModImm; + unsigned ModOpcode = + canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false); + if (ModOpcode != 0) { + MI.setDesc(TII->get(ModOpcode)); + Src.setImm(static_cast<int64_t>(ModImm)); continue; } } @@ -863,13 +885,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && Dst.getReg().isPhysical()) { - int32_t ReverseImm; + unsigned ModOpc; + int32_t ModImm; if (isKImmOperand(Src)) { MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); Src.setImm(SignExtend64(Src.getImm(), 32)); - } else if (isReverseInlineImm(Src, ReverseImm)) { - MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); - Src.setImm(ReverseImm); + } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm, + /*Scalar=*/true))) { + MI.setDesc(TII->get(ModOpc)); + Src.setImm(static_cast<int64_t>(ModImm)); } } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 913942dda19d..742fd397ff9e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -215,8 +215,6 @@ private: MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, bool IsWQM); MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); - void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry, - MachineInstr *Exit); void lowerBlock(MachineBasicBlock &MBB); void processBlock(MachineBasicBlock &MBB, bool IsEntry); @@ -241,8 +239,8 @@ public: AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); - AU.addPreserved<MachineDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); + 
AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -259,8 +257,8 @@ char SIWholeQuadMode::ID = 0; INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -785,7 +783,7 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, if (MDT) MDT->getBase().applyUpdates(DTUpdates); if (PDT) - PDT->getBase().applyUpdates(DTUpdates); + PDT->applyUpdates(DTUpdates); // Link blocks MachineInstr *MI = @@ -1025,31 +1023,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, return NewTerm; } -// Convert a strict mode transition to a pseudo transition. -// This still pre-allocates registers to prevent clobbering, -// but avoids any EXEC mask changes. -void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB, - MachineInstr *Entry, - MachineInstr *Exit) { - assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM); - assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM); - - Register SaveOrig = Entry->getOperand(0).getReg(); - - MachineInstr *NewEntry = - BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM)); - MachineInstr *NewExit = - BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM)); - - LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit); - Exit->eraseFromParent(); - - LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry); - Entry->eraseFromParent(); - - LIS->removeInterval(SaveOrig); -} - // Replace (or supplement) instructions accessing live mask. 
// This can only happen once all the live mask registers have been created // and the execute state (WQM/StrictWWM/Exact) of instructions is known. @@ -1066,12 +1039,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { SmallVector<MachineInstr *, 4> SplitPoints; char State = BI.InitialState; - MachineInstr *StrictEntry = nullptr; for (MachineInstr &MI : llvm::make_early_inc_range( llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) { - char PreviousState = State; - if (StateTransition.count(&MI)) State = StateTransition[&MI]; @@ -1084,20 +1054,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(MBB, MI); break; - case AMDGPU::ENTER_STRICT_WQM: - StrictEntry = PreviousState == StateWQM ? &MI : nullptr; - break; - case AMDGPU::EXIT_STRICT_WQM: - if (State == StateWQM && StrictEntry) { - // Transition WQM -> StrictWQM -> WQM detected. - lowerPseudoStrictMode(MBB, StrictEntry, &MI); - } - StrictEntry = nullptr; - break; - case AMDGPU::ENTER_STRICT_WWM: - case AMDGPU::EXIT_STRICT_WWM: - StrictEntry = nullptr; - break; default: break; } @@ -1251,11 +1207,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); StateTransition[MI] = StrictStateNeeded; - - // Mark block as needing lower so it will be checked for unnecessary transitions. - auto BII = Blocks.find(&MBB); - if (BII != Blocks.end()) - BII->second.NeedsLowering = true; } void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, @@ -1687,8 +1638,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); - MDT = getAnalysisIfAvailable<MachineDominatorTree>(); - PDT = getAnalysisIfAvailable<MachinePostDominatorTree>(); + auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr; if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index aee518680a60..64f33199545a 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -215,6 +215,11 @@ let isMoveImm = 1 in { } // End Uses = [SCC] } // End isMoveImm = 1 +// Variant of S_MOV_B32 used for reading from volatile registers like +// SRC_POPS_EXITING_WAVE_ID. +let hasSideEffects = 1 in +def S_MOV_B32_sideeffects : SOP1_32 <"s_mov_b32">; + let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] @@ -1196,11 +1201,15 @@ let SubtargetPredicate = isGFX9Plus in { } } // End SubtargetPredicate = isGFX9Plus +def VersionImm : S16ImmOperand { + let DecoderMethod = "decodeVersionImm"; +} + let SubtargetPredicate = isGFX10Plus in { def S_VERSION : SOPK_Pseudo< "s_version", (outs), - (ins s16imm:$simm16), + (ins VersionImm:$simm16), "$simm16"> { let has_sdst = 0; } @@ -1876,6 +1885,12 @@ let SubtargetPredicate = isNotGFX9Plus in { def : GetFPModePat<fpmode_mask_gfx6plus>; } +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat< + (int_amdgcn_pops_exiting_wave_id), + (S_MOV_B32_sideeffects (i32 SRC_POPS_EXITING_WAVE_ID)) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 2e1db1665b9c..3af536dac473 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -669,5 +669,20 @@ const char* const IdSymbolic[] = { } // namespace 
VGPRIndexMode +namespace UCVersion { + +ArrayRef<GFXVersion> getGFXVersions() { + // GFX6, GFX8 and GFX9 don't support s_version and there are no + // UC_VERSION_GFX* codes for them. + static const GFXVersion Versions[] = {{"UC_VERSION_GFX7", 0}, + {"UC_VERSION_GFX10", 4}, + {"UC_VERSION_GFX11", 6}, + {"UC_VERSION_GFX12", 9}}; + + return Versions; +} + +} // namespace UCVersion + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index 069134a7ae7f..c84c1a7dc18c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -116,6 +116,17 @@ extern const char* const IdSymbolic[]; } // namespace VGPRIndexMode +namespace UCVersion { + +struct GFXVersion { + StringLiteral Symbol; + unsigned Code; +}; + +ArrayRef<GFXVersion> getGFXVersions(); + +} // namespace UCVersion + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4b34fb27632a..9886235121d2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -159,6 +159,12 @@ namespace llvm { namespace AMDGPU { +/// \returns true if the target supports signed immediate offset for SMRD +/// instructions. +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { + return isGFX9Plus(ST); +} + /// \returns True if \p STI is AMDHSA. 
bool isHsaAbi(const MCSubtargetInfo &STI) { return STI.getTargetTriple().getOS() == Triple::AMDHSA; @@ -373,10 +379,18 @@ struct VOPTrue16Info { bool IsTrue16; }; +struct SingleUseExceptionInfo { + uint16_t Opcode; + bool IsInvalidSingleUseConsumer; + bool IsInvalidSingleUseProducer; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL +#define GET_SingleUseExceptionTable_DECL +#define GET_SingleUseExceptionTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL #define GET_VOP1InfoTable_DECL @@ -582,9 +596,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { } bool isGenericAtomic(unsigned Opc) { - return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || - Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || - Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || + return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN || @@ -608,6 +620,16 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } +bool isInvalidSingleUseConsumerInst(unsigned Opc) { + const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); + return Info && Info->IsInvalidSingleUseConsumer; +} + +bool isInvalidSingleUseProducerInst(unsigned Opc) { + const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); + return Info && Info->IsInvalidSingleUseProducer; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? 
Info->Opcode3Addr : ~0u; @@ -2803,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { return isGCN3Encoding(ST) || isGFX10Plus(ST); } -static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { - return isGFX9Plus(ST); -} - bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset) { if (isGFX12Plus(ST)) @@ -2841,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, } std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer) { + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset) { + // For unbuffered smem loads, it is illegal for the Immediate Offset to be + // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative. + // Handle case where SOffset is not present. + if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST)) + return std::nullopt; + if (isGFX12Plus(ST)) // 24 bit signed offsets return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset) : std::nullopt; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index cf8236b8e23b..af2f0bc1a630 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -360,6 +360,10 @@ struct EncodingField { static ValueType decode(uint64_t Encoded) { return Encoded; } }; +// Represents a single bit in an encoded value. +template <unsigned Bit, unsigned D = 0> +using EncodingBit = EncodingField<Bit, Bit, D>; + // A helper for encoding and decoding multiple fields. template <typename... Fields> struct EncodingFields { static constexpr uint64_t encode(Fields... 
Values) { @@ -857,6 +861,12 @@ LLVM_READONLY bool isTrue16Inst(unsigned Opc); LLVM_READONLY +bool isInvalidSingleUseConsumerInst(unsigned Opc); + +LLVM_READONLY +bool isInvalidSingleUseProducerInst(unsigned Opc); + +LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); LLVM_READONLY @@ -1297,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI); bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); unsigned hasKernargPreload(const MCSubtargetInfo &STI); +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -1469,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset); /// S_LOAD instructions have a signed offset, on other subtargets it is /// unsigned. S_BUFFER has an unsigned offset for all subtargets. std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer); + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset = false); /// \return The encoding that can be used for a 32-bit literal offset in an SMRD /// instruction. This is only useful on CI.s diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp new file mode 100644 index 000000000000..a4f4a9ed5da4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp @@ -0,0 +1,61 @@ +//===- AMDGPUDelayedMCExpr.cpp - Delayed MCExpr resolve ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDelayedMCExpr.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" + +using namespace llvm; + +static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, + MCValue Val) { + msgpack::Document *Doc = DN.getDocument(); + switch (Type) { + default: + return Doc->getEmptyNode(); + case msgpack::Type::Int: + return Doc->getNode(static_cast<int64_t>(Val.getConstant())); + case msgpack::Type::UInt: + return Doc->getNode(static_cast<uint64_t>(Val.getConstant())); + case msgpack::Type::Boolean: + return Doc->getNode(static_cast<bool>(Val.getConstant())); + } +} + +void DelayedMCExprs::assignDocNode(msgpack::DocNode &DN, msgpack::Type Type, + const MCExpr *ExprValue) { + MCValue Res; + if (ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr)) { + if (Res.isAbsolute()) { + DN = getNode(DN, Type, Res); + return; + } + } + + DelayedExprs.push_back(Expr{DN, Type, ExprValue}); +} + +bool DelayedMCExprs::resolveDelayedExpressions() { + while (!DelayedExprs.empty()) { + Expr DE = DelayedExprs.front(); + MCValue Res; + + if (!DE.ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr) || + !Res.isAbsolute()) + return false; + + DelayedExprs.pop_front(); + DE.DN = getNode(DE.DN, DE.Type, Res); + } + + return true; +} + +void DelayedMCExprs::clear() { DelayedExprs.clear(); } + +bool DelayedMCExprs::empty() { return DelayedExprs.empty(); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h new file mode 100644 index 000000000000..8c9cda3a1bdd --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h @@ -0,0 +1,39 @@ +//===- AMDGPUDelayedMCExpr.h - Delayed MCExpr resolve -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H + +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include <deque> + +namespace llvm { +class MCExpr; + +class DelayedMCExprs { + struct Expr { + msgpack::DocNode &DN; + msgpack::Type Type; + const MCExpr *ExprValue; + Expr(msgpack::DocNode &DN, msgpack::Type Type, const MCExpr *ExprValue) + : DN(DN), Type(Type), ExprValue(ExprValue) {} + }; + + std::deque<Expr> DelayedExprs; + +public: + bool resolveDelayedExpressions(); + void assignDocNode(msgpack::DocNode &DN, msgpack::Type Type, + const MCExpr *ExprValue); + void clear(); + bool empty(); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 0fa67c559cb2..a53bf70d7771 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -20,6 +20,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/EndianStream.h" @@ -137,12 +138,22 @@ void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) { setRegister(getRsrc1Reg(CC), Val); } +void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + setRegister(getRsrc1Reg(CC), Val, Ctx); +} + // Set the rsrc2 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. 
void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) { setRegister(getRsrc1Reg(CC) + 1, Val); } +void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + setRegister(getRsrc1Reg(CC) + 1, Val, Ctx); +} + // Set the SPI_PS_INPUT_ENA register in the metadata. // In fact this ORs the value into any previous setting of the register. void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) { @@ -182,6 +193,40 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) { N = N.getDocument()->getNode(Val); } +// Set a register in the metadata. +// In fact this ORs the value into any previous setting of the register. +void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val, + MCContext &Ctx) { + if (!isLegacy()) { + // In the new MsgPack format, ignore register numbered >= 0x10000000. It + // is a PAL ABI pseudo-register in the old non-MsgPack format. + if (Reg >= 0x10000000) + return; + } + auto &N = getRegisters()[MsgPackDoc.getNode(Reg)]; + auto ExprIt = REM.find(Reg); + + if (ExprIt != REM.end()) { + Val = MCBinaryExpr::createOr(Val, ExprIt->getSecond(), Ctx); + // This conditional may be redundant most of the time, but the alternate + // setRegister(unsigned, unsigned) could've been called while the + // conditional returns true (i.e., Reg exists in REM). + if (N.getKind() == msgpack::Type::UInt) { + const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); + Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); + } + ExprIt->getSecond() = Val; + } else if (N.getKind() == msgpack::Type::UInt) { + const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); + Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); + int64_t Unused; + if (!Val->evaluateAsAbsolute(Unused)) + REM[Reg] = Val; + (void)Unused; + } + DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val); +} + // Set the entry point name for one shader. 
void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) { if (isLegacy()) @@ -207,11 +252,29 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedVgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_VGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedVgprsKey, Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".vgpr_count", msgpack::Type::UInt, Val); +} + // Set the number of used agprs in the metadata. void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".agpr_count"] = Val; } +void AMDGPUPALMetadata::setNumUsedAgprs(unsigned CC, const MCExpr *Val) { + setHwStage(CC, ".agpr_count", msgpack::Type::UInt, Val); +} + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. @@ -228,6 +291,20 @@ void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setNumUsedSgprs(unsigned CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedSgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_SGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedSgprsKey, Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".sgpr_count", msgpack::Type::UInt, Val); +} + // Set the scratch size in the metadata. 
void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { if (isLegacy()) { @@ -239,6 +316,17 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setScratchSize(unsigned CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + setRegister(getScratchSizeKey(CC), Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".scratch_memory_size", msgpack::Type::UInt, Val); +} + // Set the stack frame size of a function in the metadata. void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) { auto Node = getShaderFunction(FnName); @@ -259,6 +347,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName, Node[".vgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName, + const MCExpr *Val) { + auto Node = getShaderFunction(FnName); + DelayedExprs.assignDocNode(Node[".vgpr_count"], msgpack::Type::UInt, Val); +} + // Set the number of used vgprs in the metadata. void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, unsigned Val) { @@ -266,6 +360,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, Node[".sgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, + const MCExpr *Val) { + auto Node = getShaderFunction(FnName); + DelayedExprs.assignDocNode(Node[".sgpr_count"], msgpack::Type::UInt, Val); +} + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. 
void AMDGPUPALMetadata::setWave32(unsigned CC) { @@ -662,6 +762,7 @@ void AMDGPUPALMetadata::toString(std::string &String) { String.clear(); if (!BlobType) return; + ResolvedAll = DelayedExprs.resolveDelayedExpressions(); raw_string_ostream Stream(String); if (isLegacy()) { if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil) @@ -711,6 +812,7 @@ void AMDGPUPALMetadata::toString(std::string &String) { // a .note record of the specified AMD type. Returns an empty blob if // there is no PAL metadata, void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) { + ResolvedAll = DelayedExprs.resolveDelayedExpressions(); if (Type == ELF::NT_AMD_PAL_METADATA) toLegacyBlob(Blob); else if (Type) @@ -906,11 +1008,17 @@ void AMDGPUPALMetadata::setLegacy() { // Erase all PAL metadata. void AMDGPUPALMetadata::reset() { MsgPackDoc.clear(); + REM.clear(); + DelayedExprs.clear(); Registers = MsgPackDoc.getEmptyNode(); HwStages = MsgPackDoc.getEmptyNode(); ShaderFunctions = MsgPackDoc.getEmptyNode(); } +bool AMDGPUPALMetadata::resolvedAllMCExpr() { + return ResolvedAll && DelayedExprs.empty(); +} + unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) { assert(idx < 2 && "illegal index to PAL version - should be 0 (major) or 1 (minor)"); @@ -942,6 +1050,11 @@ void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) { getHwStage(CC)[field] = Val; } +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, + msgpack::Type Type, const MCExpr *Val) { + DelayedExprs.assignDocNode(getHwStage(CC)[field], Type, Val); +} + void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) { getComputeRegisters()[field] = Val; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 158f766d0485..e05532afed2f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -13,7 +13,10 @@ #ifndef 
LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H +#include "AMDGPUDelayedMCExpr.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/MC/MCContext.h" namespace llvm { @@ -21,6 +24,10 @@ class Module; class StringRef; class AMDGPUPALMetadata { +public: + using RegisterExprMap = DenseMap<unsigned, const MCExpr *>; + +private: unsigned BlobType = 0; msgpack::Document MsgPackDoc; msgpack::DocNode Registers; @@ -32,6 +39,10 @@ class AMDGPUPALMetadata { msgpack::DocNode ComputeRegisters; msgpack::DocNode GraphicsRegisters; + DelayedMCExprs DelayedExprs; + RegisterExprMap REM; + bool ResolvedAll = true; + public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for // per-function modification. @@ -45,10 +56,12 @@ public: // Set the rsrc1 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. void setRsrc1(unsigned CC, unsigned Val); + void setRsrc1(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the rsrc2 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. void setRsrc2(unsigned CC, unsigned Val); + void setRsrc2(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the SPI_PS_INPUT_ENA register in the metadata. // In fact this ORs the value into any previous setting of the register. @@ -64,6 +77,7 @@ public: // Set a register in the metadata. // In fact this ORs the value into any previous setting of the register. void setRegister(unsigned Reg, unsigned Val); + void setRegister(unsigned Reg, const MCExpr *Val, MCContext &Ctx); // Set the entry point name for one shader. void setEntryPoint(unsigned CC, StringRef Name); @@ -72,18 +86,22 @@ public: // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of vgprs to allocate. 
void setNumUsedVgprs(unsigned CC, unsigned Val); + void setNumUsedVgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the number of used agprs in the metadata. This is an optional advisory // record for logging etc; void setNumUsedAgprs(unsigned CC, unsigned Val); + void setNumUsedAgprs(unsigned CC, const MCExpr *Val); // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. void setNumUsedSgprs(unsigned CC, unsigned Val); + void setNumUsedSgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the scratch size in the metadata. void setScratchSize(unsigned CC, unsigned Val); + void setScratchSize(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the stack frame size of a function in the metadata. void setFunctionScratchSize(StringRef FnName, unsigned Val); @@ -97,11 +115,13 @@ public: // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of vgprs to allocate. void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val); + void setFunctionNumUsedVgprs(StringRef FnName, const MCExpr *Val); // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val); + void setFunctionNumUsedSgprs(StringRef FnName, const MCExpr *Val); // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. 
@@ -138,6 +158,8 @@ public: void setHwStage(unsigned CC, StringRef field, unsigned Val); void setHwStage(unsigned CC, StringRef field, bool Val); + void setHwStage(unsigned CC, StringRef field, msgpack::Type Type, + const MCExpr *Val); void setComputeRegisters(StringRef field, unsigned Val); void setComputeRegisters(StringRef field, bool Val); @@ -156,6 +178,8 @@ public: // Erase all PAL metadata. void reset(); + bool resolvedAllMCExpr(); + private: // Return whether the blob type is legacy PAL metadata. bool isLegacy() const; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index eaee1a2a9739..720d5a1853db 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -14,6 +14,7 @@ #include "AMDKernelCodeT.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "Utils/SIDefinesUtils.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCContext.h" @@ -220,43 +221,6 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) { return map.lookup(name) - 1; // returns -1 if not found } -static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) { - unsigned Shift = 0; - unsigned Mask = 0; - - Mask = ~Value; - for (; !(Mask & 1); Shift++, Mask >>= 1) { - } - - return std::make_pair(Shift, Mask); -} - -static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask, - uint32_t Shift, MCContext &Ctx) { - if (Mask) { - const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); - Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); - } - if (Shift) { - const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); - Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); - } - return Val; -} - -static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask, - uint32_t Shift, MCContext &Ctx) { - if (Shift) { - const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); 
- Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx); - } - if (Mask) { - const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); - Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); - } - return Val; -} - class PrintField { public: template <typename T, T AMDGPUMCKernelCodeT::*ptr, @@ -305,10 +269,10 @@ static ArrayRef<PrintFx> getPrinterTable() { const MCExpr *Value; \ if (PGMType == 0) { \ Value = \ - MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \ + maskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \ } else { \ Value = \ - MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \ + maskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \ } \ int64_t Val; \ if (Value->evaluateAsAbsolute(Val)) \ @@ -392,7 +356,7 @@ static ArrayRef<ParseFx> getParserTable() { if (!parseExpr(MCParser, Value, Err)) \ return false; \ auto [Shift, Mask] = getShiftMask(Complement); \ - Value = MaskShiftSet(Value, Mask, Shift, Ctx); \ + Value = maskShiftSet(Value, Mask, Shift, Ctx); \ const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \ if (PGMType == 0) { \ C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \ @@ -542,7 +506,7 @@ void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) { const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx); CodeProps = MCBinaryExpr::createOr( CodeProps, - MaskShiftSet(is_dynamic_callstack, + maskShiftSet(is_dynamic_callstack, (1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx), Ctx); diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt index 2f4ce8eaf1d6..09b8da9f5dd4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMAMDGPUUtils AMDGPUAsmUtils.cpp AMDGPUBaseInfo.cpp + AMDGPUDelayedMCExpr.cpp AMDGPUMemoryUtils.cpp 
AMDGPUPALMetadata.cpp AMDKernelCodeTUtils.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h new file mode 100644 index 000000000000..64d21de12c26 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h @@ -0,0 +1,79 @@ +//===-- SIDefinesUtils.h - SI Helper Functions ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +/// \file - utility functions for the SIDefines and its common uses. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H + +#include "llvm/MC/MCExpr.h" +#include <utility> + +namespace llvm { +class MCContext; +namespace AMDGPU { + +/// Deduce the least significant bit aligned shift and mask values for a binary +/// Complement \p Value (as they're defined in SIDefines.h as C_*) as a returned +/// pair<shift, mask>. That is to say \p Value == ~(mask << shift) +/// +/// For example, given C_00B848_FWD_PROGRESS (i.e., 0x7FFFFFFF) from +/// SIDefines.h, this will return the pair as (31,1). +constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) { + unsigned Shift = 0; + unsigned Mask = 0; + + Mask = ~Value; + for (; !(Mask & 1); Shift++, Mask >>= 1) { + } + + return std::make_pair(Shift, Mask); +} + +/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return +/// the masked and left shifted, in said order of operations, MCExpr * created +/// within the MCContext \p Ctx. 
+/// +/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr +/// * will be the equivalent of (Val & 0xf) << 6 +inline const MCExpr *maskShiftSet(const MCExpr *Val, uint32_t Mask, + uint32_t Shift, MCContext &Ctx) { + if (Mask) { + const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); + Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); + } + if (Shift) { + const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); + Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); + } + return Val; +} + +/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return +/// the right shifted and masked, in said order of operations, MCExpr * created +/// within the MCContext \p Ctx. +/// +/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr +/// * will be the equivalent of (Val >> 6) & 0xf +inline const MCExpr *maskShiftGet(const MCExpr *Val, uint32_t Mask, + uint32_t Shift, MCContext &Ctx) { + if (Shift) { + const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); + Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx); + } + if (Mask) { + const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); + Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); + } + return Val; +} + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b96c41c1e12a..2c0d61ee4afa 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies { !if(P.HasOMod, [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))] + [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))] ) ); } @@ -249,9 +249,15 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, 
untyped]> { // FIXME: Specify SchedRW for READFIRSTLANE_B32 // TODO: There is VOP3 encoding also def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, - getVOP1Pat<int_amdgcn_readfirstlane, - VOP_READFIRSTLANE>.ret, 1> { + [], 1> { let isConvergent = 1; + let IsInvalidSingleUseConsumer = 1; +} + +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))), + (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0)) + >; } let isReMaterializable = 1 in { @@ -362,6 +368,7 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC32 = VRegSrc_32; let Src0RC64 = VRegSrc_32; + let IsInvalidSingleUseConsumer = 1; } // Special case because there are no true output operands. Hack vdst @@ -405,8 +412,12 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un let EmitDst = 1; // force vdst emission } -def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>; -def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32>; +let IsInvalidSingleUseProducer = 1 in { + def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>; + def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32> { + let IsInvalidSingleUseConsumer = 1; + } +} let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { // v_movreld_b32 is a special case because the destination output @@ -535,6 +546,7 @@ let SubtargetPredicate = isGFX9Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; + let IsInvalidSingleUseConsumer = 1; } let isReMaterializable = 1 in @@ -699,6 +711,8 @@ let SubtargetPredicate = isGFX10Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; + let IsInvalidSingleUseConsumer = 1; + let IsInvalidSingleUseProducer = 1; } } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus @@ -718,15 +732,22 @@ def V_ACCVGPR_MOV_B32 : 
VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1 let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - getVOP1Pat<int_amdgcn_permlane64, - VOP_MOVRELS>.ret, - /*VOP1Only=*/ 1>; + [], /*VOP1Only=*/ 1> { + let IsInvalidSingleUseConsumer = 1; + let IsInvalidSingleUseProducer = 1; + } defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; } // End SubtargetPredicate = isGFX11Plus +foreach vt = Reg32Types.types in { + def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)), + (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0))) + >; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index ccb5b33dbdc4..9989752c2f6b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -779,15 +779,25 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, } // End isCommutable = 1 // These are special and do not read the exec mask. 
-let isConvergent = 1, Uses = []<Register> in { -def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; +let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in { +def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { + let IsInvalidSingleUseProducer = 1; + } } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)), + (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1) + >; + + def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)), + (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2) + >; +} + let isReMaterializable = 1 in { defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 479c0aaf0174..efa8e9c74d44 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -13,9 +13,11 @@ def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { let Outs64 = (outs DstRC.RegClass:$vdst); let HasExtVOP3DPP = 0; let HasExtDPP = 0; + let IsSingle = 1; } def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { let Outs64 = (outs DstRC.RegClass:$vdst); + let IsSingle = 1; } } @@ -105,7 +107,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, } class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { - + let IsSingle = 1; let HasOMod = 
!ne(DstVT.Value, f16.Value); let HasHigh = 1; @@ -155,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l } // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteIntMul] in { +let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>; -} // End SchedRW = [WriteIntMul] +} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>; @@ -258,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -let Constraints = "@earlyclobber $vdst" in { +let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in { defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; -} // End Constraints = "@earlyclobber $vdst" +} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 let isReMaterializable = 1 in { @@ -275,14 +277,16 @@ let SchedRW = [Write64Bit] in { defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>; } // End SubtargetPredicate = isGFX6GFX7 + let IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX8Plus in { defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>; - } // 
End SubtargetPredicate = isGFX8Plus + } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>; } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 + } // End IsInvalidSingleUseConsumer = 1 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -307,14 +311,14 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> { let HasModifiers = 0; } -let SubtargetPredicate = isGFX7Plus in { +let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] -} // End SubtargetPredicate = isGFX7Plus +} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 -let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; @@ -324,7 +328,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } -} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 let FPDPRounding = 1 in { @@ -838,9 +842,9 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, 
GISDNodeXFormEquiv<opsel_i1timm>; class PermlanePat<SDPatternOperator permlane, - Instruction inst> : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), + Instruction inst, ValueType vt> : GCNPat< + (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; @@ -859,13 +863,15 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in + } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 - def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>; - def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>; + foreach vt = Reg32Types.types in { + def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>; + def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>; + } defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>; defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>; @@ -1275,11 +1281,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; - -let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { - defm V_WRITELANE_B32 : 
VOP3_Real_No_Suffix_gfx10<0x361>; -} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) +let IsInvalidSingleUseConsumer = 1 in { + defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; + let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; + } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 +} // End IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX10Before1030 in { defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4c78bd94458d..4cab15435199 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { let isReMaterializable = 1 in { let isCommutable = 1 in { defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; -defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>; let FPDPRounding = 1 in { defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; @@ -382,15 +382,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", AMDGPUfdot2, 1/*ExplicitClamp*/>; let OtherPredicates = [HasDot7Insts] in { -defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; +let IsInvalidSingleUseConsumer = 1 in { + defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; +} defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; 
} // End OtherPredicates = [HasDot7Insts] let OtherPredicates = [HasDot1Insts] in { -defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; +let IsInvalidSingleUseConsumer = 1 in { + defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; +} defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; } // End OtherPredicates = [HasDot1Insts] diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 372c4f533629..3bcee28a2cb7 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -435,8 +435,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; -multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +let IsInvalidSingleUseConsumer = 1 in { + multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +} multiclass VOPCX_F16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -465,8 +467,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> { multiclass VOPCX_I32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>; -multiclass VOPCX_I64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +let IsInvalidSingleUseConsumer = 1 in { + multiclass VOPCX_I64 <string opName, string revOp = opName> : + VOPCX_Pseudos 
<opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +} //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5d1573d8dec1..2b05165cc94b 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -17,6 +17,8 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit FPDPRounding; + bit IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -81,6 +83,8 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, string Mnemonic = opName; Instruction Opcode = !cast<Instruction>(NAME); bit IsTrue16 = P.IsTrue16; + bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer; VOPProfile Pfl = P; string AsmOperands; @@ -175,6 +179,8 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : class VOP_Real<VOP_Pseudo ps> { Instruction Opcode = !cast<Instruction>(NAME); bit IsSingle = ps.Pfl.IsSingle; + bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer; } class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : @@ -823,17 +829,11 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : - InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>, - VOP <OpName>, - SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> { - - let isPseudo = 1; - let isCodeGenOnly = 1; + VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let UseNamedOperandTable = 1; let VALU = 1; let DPP = 1; 
@@ -846,7 +846,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let isConvergent = 1; - string Mnemonic = OpName; string AsmOperands = asmOps; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); @@ -857,7 +856,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; - VOPProfile Pfl = P; + let IsInvalidSingleUseConsumer = !not(VINTERP); + let IsInvalidSingleUseProducer = !not(VINTERP); } class VOP3_DPP_Pseudo <string OpName, VOPProfile P> : @@ -1725,3 +1725,12 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; } + +def SingleUseExceptionTable : GenericTable { + let FilterClass = "VOP_Pseudo"; + let CppTypeName = "SingleUseExceptionInfo"; + let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getSingleUseExceptionHelper"; +} |
