diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 761 |
1 files changed, 0 insertions, 761 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 2e1bdf469247..67d8715d3f1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -17,7 +17,6 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" #include "R600Subtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -36,308 +35,12 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#define AMDGPUSubtarget GCNSubtarget -#include "AMDGPUGenSubtargetInfo.inc" -#undef AMDGPUSubtarget - -static cl::opt<bool> EnablePowerSched( - "amdgpu-enable-power-sched", - cl::desc("Enable scheduling to minimize mAI power bursts"), - cl::init(false)); - -static cl::opt<bool> EnableVGPRIndexMode( - "amdgpu-vgpr-index-mode", - cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), - cl::init(false)); - -static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", - cl::desc("Enable the use of AA during codegen."), - cl::init(true)); - -static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold", - cl::desc("Number of addresses from which to enable MIMG NSA."), - cl::init(3), cl::Hidden); - -GCNSubtarget::~GCNSubtarget() = default; - -GCNSubtarget & -GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { - // Determine default and user-specified characteristics - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. - // - // Similarly we want enable-prt-strict-null to be on by default and not to - // unset everything else if it is disabled - - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); - - // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default - if (isAmdHsaOS()) - FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; - - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS - - // Disable mutually exclusive bits. - if (FS.contains_insensitive("+wavefrontsize")) { - if (!FS.contains_insensitive("wavefrontsize16")) - FullFS += "-wavefrontsize16,"; - if (!FS.contains_insensitive("wavefrontsize32")) - FullFS += "-wavefrontsize32,"; - if (!FS.contains_insensitive("wavefrontsize64")) - FullFS += "-wavefrontsize64,"; - } - - FullFS += FS; - - ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); - - // Implement the "generic" processors, which acts as the default when no - // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to - // the first amdgcn target that supports flat addressing. Other OSes defaults - // to the first amdgcn target. - if (Gen == AMDGPUSubtarget::INVALID) { - Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS - : AMDGPUSubtarget::SOUTHERN_ISLANDS; - } - - if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && - !hasFeature(AMDGPU::FeatureWavefrontSize64)) { - // If there is no default wave size it must be a generation before gfx10, - // these have FeatureWavefrontSize64 in their definition already. For gfx10+ - // set wave32 as a default. - ToggleFeature(AMDGPU::FeatureWavefrontSize32); - } - - // We don't support FP64 for EG/NI atm. - assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); - - // Targets must either support 64-bit offsets for MUBUF instructions, and/or - // support flat operations, otherwise they cannot access a 64-bit global - // address space - assert(hasAddr64() || hasFlat()); - // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets - // that do not support ADDR64 variants of MUBUF instructions. Such targets - // cannot use a 64 bit offset with a MUBUF instruction to access the global - // address space - if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = true; - } - // Unless +-flat-for-global is specified, use MUBUF instructions for global - // address space access if flat operations are not available. - if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = false; - } - - // Set defaults if needed. - if (MaxPrivateElementSize == 0) - MaxPrivateElementSize = 4; - - if (LDSBankCount == 0) - LDSBankCount = 32; - - if (TT.getArch() == Triple::amdgcn) { - if (LocalMemorySize == 0) - LocalMemorySize = 32768; - - // Do something sensible for unspecified target. - if (!HasMovrel && !HasVGPRIndexMode) - HasMovrel = true; - } - - AddressableLocalMemorySize = LocalMemorySize; - - if (AMDGPU::isGFX10Plus(*this) && - !getFeatureBits().test(AMDGPU::FeatureCuMode)) - LocalMemorySize *= 2; - - // Don't crash on invalid devices. - if (WavefrontSizeLog2 == 0) - WavefrontSizeLog2 = 5; - - HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; - - TargetID.setTargetIDFromFeaturesString(FS); - - LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " - << TargetID.getXnackSetting() << '\n'); - LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " - << TargetID.getSramEccSetting() << '\n'); - - return *this; -} - -void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { - LLVMContext &Ctx = F.getContext(); - if (hasFeature(AMDGPU::FeatureWavefrontSize32) == - hasFeature(AMDGPU::FeatureWavefrontSize64)) { - Ctx.diagnose(DiagnosticInfoUnsupported( - F, "must specify exactly one of wavefrontsize32 and wavefrontsize64")); - } -} - AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; } -GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) - : // clang-format off - AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), - AMDGPUSubtarget(TT), - TargetTriple(TT), - TargetID(*this), - InstrItins(getInstrItineraryForCPU(GPU)), - InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), - TLInfo(TM, *this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { - // clang-format on - MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); - EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); - CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering()); - InlineAsmLoweringInfo = - std::make_unique<InlineAsmLowering>(getTargetLowering()); - Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM); - RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this); - InstSelector = - std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM); -} - -unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { - if (getGeneration() < GFX10) - return 1; - - switch (Opcode) { - case AMDGPU::V_LSHLREV_B64_e64: - case AMDGPU::V_LSHLREV_B64_gfx10: - case AMDGPU::V_LSHLREV_B64_e64_gfx11: - case AMDGPU::V_LSHLREV_B64_e32_gfx12: - case AMDGPU::V_LSHLREV_B64_e64_gfx12: - case AMDGPU::V_LSHL_B64_e64: - case AMDGPU::V_LSHRREV_B64_e64: - case AMDGPU::V_LSHRREV_B64_gfx10: - case AMDGPU::V_LSHRREV_B64_e64_gfx11: - case AMDGPU::V_LSHRREV_B64_e64_gfx12: - case AMDGPU::V_LSHR_B64_e64: - case AMDGPU::V_ASHRREV_I64_e64: - case AMDGPU::V_ASHRREV_I64_gfx10: - case AMDGPU::V_ASHRREV_I64_e64_gfx11: - case AMDGPU::V_ASHRREV_I64_e64_gfx12: - case AMDGPU::V_ASHR_I64_e64: - return 1; - } - - return 2; -} - -/// This list was mostly derived from experimentation. -bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::V_CVT_F16_F32_e32: - case AMDGPU::V_CVT_F16_F32_e64: - case AMDGPU::V_CVT_F16_U16_e32: - case AMDGPU::V_CVT_F16_U16_e64: - case AMDGPU::V_CVT_F16_I16_e32: - case AMDGPU::V_CVT_F16_I16_e64: - case AMDGPU::V_RCP_F16_e64: - case AMDGPU::V_RCP_F16_e32: - case AMDGPU::V_RSQ_F16_e64: - case AMDGPU::V_RSQ_F16_e32: - case AMDGPU::V_SQRT_F16_e64: - case AMDGPU::V_SQRT_F16_e32: - case AMDGPU::V_LOG_F16_e64: - case AMDGPU::V_LOG_F16_e32: - case AMDGPU::V_EXP_F16_e64: - case AMDGPU::V_EXP_F16_e32: - case AMDGPU::V_SIN_F16_e64: - case AMDGPU::V_SIN_F16_e32: - case AMDGPU::V_COS_F16_e64: - case AMDGPU::V_COS_F16_e32: - case AMDGPU::V_FLOOR_F16_e64: - case AMDGPU::V_FLOOR_F16_e32: - case AMDGPU::V_CEIL_F16_e64: - case AMDGPU::V_CEIL_F16_e32: - case AMDGPU::V_TRUNC_F16_e64: - case AMDGPU::V_TRUNC_F16_e32: - case AMDGPU::V_RNDNE_F16_e64: - case AMDGPU::V_RNDNE_F16_e32: - case AMDGPU::V_FRACT_F16_e64: - case AMDGPU::V_FRACT_F16_e32: - case AMDGPU::V_FREXP_MANT_F16_e64: - case AMDGPU::V_FREXP_MANT_F16_e32: - case AMDGPU::V_FREXP_EXP_I16_F16_e64: - case AMDGPU::V_FREXP_EXP_I16_F16_e32: - case AMDGPU::V_LDEXP_F16_e64: - case AMDGPU::V_LDEXP_F16_e32: - case AMDGPU::V_LSHLREV_B16_e64: - case AMDGPU::V_LSHLREV_B16_e32: - case AMDGPU::V_LSHRREV_B16_e64: - case AMDGPU::V_LSHRREV_B16_e32: - case AMDGPU::V_ASHRREV_I16_e64: - case AMDGPU::V_ASHRREV_I16_e32: - case AMDGPU::V_ADD_U16_e64: - case AMDGPU::V_ADD_U16_e32: - case AMDGPU::V_SUB_U16_e64: - case AMDGPU::V_SUB_U16_e32: - case AMDGPU::V_SUBREV_U16_e64: - case AMDGPU::V_SUBREV_U16_e32: - case AMDGPU::V_MUL_LO_U16_e64: - case AMDGPU::V_MUL_LO_U16_e32: - case AMDGPU::V_ADD_F16_e64: - case AMDGPU::V_ADD_F16_e32: - case AMDGPU::V_SUB_F16_e64: - case AMDGPU::V_SUB_F16_e32: - case AMDGPU::V_SUBREV_F16_e64: - case AMDGPU::V_SUBREV_F16_e32: - case AMDGPU::V_MUL_F16_e64: - case AMDGPU::V_MUL_F16_e32: - case AMDGPU::V_MAX_F16_e64: - case AMDGPU::V_MAX_F16_e32: - case AMDGPU::V_MIN_F16_e64: - case AMDGPU::V_MIN_F16_e32: - case AMDGPU::V_MAX_U16_e64: - case AMDGPU::V_MAX_U16_e32: - case AMDGPU::V_MIN_U16_e64: - case AMDGPU::V_MIN_U16_e32: - case AMDGPU::V_MAX_I16_e64: - case AMDGPU::V_MAX_I16_e32: - case AMDGPU::V_MIN_I16_e64: - case AMDGPU::V_MIN_I16_e32: - case AMDGPU::V_MAD_F16_e64: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: - case AMDGPU::V_DIV_FIXUP_F16_e64: - // On gfx10, all 16-bit instructions preserve the high bits. - return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MADAK_F16: - case AMDGPU::V_MADMK_F16: - case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_MAC_F16_e32: - case AMDGPU::V_FMAMK_F16: - case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_FMAC_F16_e64: - case AMDGPU::V_FMAC_F16_e32: - // In gfx9, the preferred handling of the unused high 16-bits changed. Most - // instructions maintain the legacy behavior of 0ing. Some instructions - // changed to preserving the high bits. - return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; - case AMDGPU::V_MAD_MIXLO_F16: - case AMDGPU::V_MAD_MIXHI_F16: - default: - return false; - } -} - // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. @@ -650,391 +353,6 @@ AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const { : AMDGPUDwarfFlavour::Wave64; } -void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - - // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. - if (!enableSIScheduler()) - Policy.ShouldTrackLaneMasks = true; -} - -void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { - if (isWave32()) { - // Fix implicit $vcc operands after MIParser has verified that they match - // the instruction definitions. - for (auto &MBB : MF) { - for (auto &MI : MBB) - InstrInfo.fixImplicitOperands(MI); - } - } -} - -bool GCNSubtarget::hasMadF16() const { - return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; -} - -bool GCNSubtarget::useVGPRIndexMode() const { - return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); -} - -bool GCNSubtarget::useAA() const { return UseAA; } - -unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), - getGeneration()); -} - -unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { - return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); -} - -unsigned -GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { - if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - - if (HasFlatScratch || HasArchitectedFlatScratch) { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). - if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) - return 4; // FLAT_SCRATCH, VCC (in that order). - } - - if (isXNACKEnabled()) - return 4; // XNACK, VCC (in that order). - return 2; // VCC. -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { - // In principle we do not need to reserve SGPR pair used for flat_scratch if - // we know flat instructions do not access the stack anywhere in the - // program. For now assume it's needed if we have flat instructions. - const bool KernelUsesFlatScratch = hasFlatAddressSpace(); - return getBaseReservedNumSGPRs(KernelUsesFlatScratch); -} - -unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, - unsigned NumSGPRs, - unsigned NumVGPRs) const { - unsigned Occupancy = - std::min(getMaxWavesPerEU(), - getOccupancyWithLocalMemSize(LDSSize, F)); - if (NumSGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); - if (NumVGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); - return Occupancy; -} - -unsigned GCNSubtarget::getBaseMaxNumSGPRs( - const Function &F, std::pair<unsigned, unsigned> WavesPerEU, - unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { - // Compute maximum number of SGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); - unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); - - // Check if maximum number of SGPRs was explicitly requested using - // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && (Requested <= ReservedNumSGPRs)) - Requested = 0; - - // If more SGPRs are required to support the input user/system SGPRs, - // increase to accommodate them. - // - // FIXME: This really ends up using the requested number of SGPRs + number - // of reserved special registers in total. Theoretically you could re-use - // the last input registers for these special registers, but this would - // require a lot of complexity to deal with the weird aliasing. - unsigned InputNumSGPRs = PreloadedSGPRs; - if (Requested && Requested < InputNumSGPRs) - Requested = InputNumSGPRs; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumSGPRs = Requested; - } - - if (hasSGPRInitBug()) - MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - - return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), - getReservedNumSGPRs(MF)); -} - -static unsigned getMaxNumPreloadedSGPRs() { - using USI = GCNUserSGPRUsageInfo; - // Max number of user SGPRs - const unsigned MaxUserSGPRs = - USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + - USI::getNumUserSGPRForField(USI::DispatchPtrID) + - USI::getNumUserSGPRForField(USI::QueuePtrID) + - USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + - USI::getNumUserSGPRForField(USI::DispatchIdID) + - USI::getNumUserSGPRForField(USI::FlatScratchInitID) + - USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); - - // Max number of system SGPRs - const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX - 1 + // WorkGroupIDY - 1 + // WorkGroupIDZ - 1 + // WorkGroupInfo - 1; // private segment wave byte offset - - // Max number of synthetic SGPRs - const unsigned SyntheticSGPRs = 1; // LDSKernelId - - return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { - return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), - getReservedNumSGPRs(F)); -} - -unsigned GCNSubtarget::getBaseMaxNumVGPRs( - const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const { - // Compute maximum number of VGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); - - // Check if maximum number of VGPRs was explicitly requested using - // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); - - if (hasGFX90AInsts()) - Requested *= 2; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumVGPRs = Requested; - } - - return MaxNumVGPRs; -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { - return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); -} - -void GCNSubtarget::adjustSchedDependency( - SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, - const TargetSchedModel *SchedModel) const { - if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || - !Def->isInstr() || !Use->isInstr()) - return; - - MachineInstr *DefI = Def->getInstr(); - MachineInstr *UseI = Use->getInstr(); - - if (DefI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); - MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); - unsigned Lat = 0; - for (++I; I != E && I->isBundledWithPred(); ++I) { - if (I->modifiesRegister(Reg, TRI)) - Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); - else if (Lat) - --Lat; - } - Dep.setLatency(Lat); - } else if (UseI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); - MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); - unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); - for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { - if (I->readsRegister(Reg, TRI)) - break; - --Lat; - } - Dep.setLatency(Lat); - } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { - // Work around the fact that SIInstrInfo::fixImplicitOperands modifies - // implicit operands which come from the MCInstrDesc, which can fool - // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit - // pseudo operands. - Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( - DefI, DefOpIdx, UseI, UseOpIdx)); - } -} - -namespace { -struct FillMFMAShadowMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - ScheduleDAGMI *DAG; - - FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} - - bool isSALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isSALU(*MI) && !MI->isTerminator(); - } - - bool isVALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isVALU(*MI); - } - - // Link as many SALU instructions in chain as possible. Return the size - // of the chain. Links up to MaxChain instructions. - unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, - SmallPtrSetImpl<SUnit *> &Visited) const { - SmallVector<SUnit *, 8> Worklist({To}); - unsigned Linked = 0; - - while (!Worklist.empty() && MaxChain-- > 0) { - SUnit *SU = Worklist.pop_back_val(); - if (!Visited.insert(SU).second) - continue; - - LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From); - dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); - - if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) - if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) - ++Linked; - - for (SDep &SI : From->Succs) { - SUnit *SUv = SI.getSUnit(); - if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && - DAG->canAddEdge(SUv, SU)) - DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); - } - - for (SDep &SI : SU->Succs) { - SUnit *Succ = SI.getSUnit(); - if (Succ != SU && isSALU(Succ)) - Worklist.push_back(Succ); - } - } - - return Linked; - } - - void apply(ScheduleDAGInstrs *DAGInstrs) override { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasMAIInsts()) - return; - DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - // Scan for MFMA long latency instructions and try to add a dependency - // of available SALU instructions to give them a chance to fill MFMA - // shadow. That is desirable to fill MFMA shadow with SALU instructions - // rather than VALU to prevent power consumption bursts and throttle. - auto LastSALU = DAG->SUnits.begin(); - auto E = DAG->SUnits.end(); - SmallPtrSet<SUnit*, 32> Visited; - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MAI = *SU.getInstr(); - if (!TII->isMAI(MAI) || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) - continue; - - unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; - - LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); - dbgs() << "Need " << Lat - << " instructions to cover latency.\n"); - - // Find up to Lat independent scalar instructions as early as - // possible such that they can be scheduled after this MFMA. - for ( ; Lat && LastSALU != E; ++LastSALU) { - if (Visited.count(&*LastSALU)) - continue; - - if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || - !DAG->canAddEdge(&*LastSALU, &SU)) - continue; - - Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); - } - } - } -}; -} // namespace - -void GCNSubtarget::getPostRAMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); -} - -std::unique_ptr<ScheduleDAGMutation> -GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) - : nullptr; -} - -unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { - if (getGeneration() >= AMDGPUSubtarget::GFX12) - return 0; // Not MIMG encoding. - - if (NSAThreshold.getNumOccurrences() > 0) - return std::max(NSAThreshold.getValue(), 2u); - - int Value = MF.getFunction().getFnAttributeAsParsedInteger( - "amdgpu-nsa-threshold", -1); - if (Value > 0) - return std::max(Value, 2); - - return 3; -} - const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); @@ -1048,85 +366,6 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct TM.getSubtarget<R600Subtarget>(F)); } -GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, - const GCNSubtarget &ST) - : ST(ST) { - const CallingConv::ID CC = F.getCallingConv(); - const bool IsKernel = - CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); - - if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) - KernargSegmentPtr = true; - - bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - - if (!AMDGPU::isGraphics(CC)) { - if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) - DispatchPtr = true; - - // FIXME: Can this always be disabled with < COv5? - if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) - QueuePtr = true; - - if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) - DispatchID = true; - } - - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && - (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - - if (hasImplicitBufferPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); - - if (hasPrivateSegmentBuffer()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); - - if (hasDispatchPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); - - if (hasQueuePtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); - - if (hasKernargSegmentPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); - - if (hasDispatchID()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); - - if (hasFlatScratchInit()) - NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); - - if (hasPrivateSegmentSize()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); -} - -void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { - assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); - NumKernargPreloadSGPRs += NumSGPRs; - NumUsedUserSGPRs += NumSGPRs; -} - -unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { - return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; -} - SmallVector<unsigned> AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); |
