diff options
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 34 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 39 |
4 files changed, 138 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 254b75b784e7..ae553da22e50 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1150,6 +1150,40 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { GCNSchedStage::finalizeGCNSchedStage(); } +bool ILPInitialScheduleStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor(); + OriginalDSReadLatencyScaleFactor = TII->getDSReadLatencyScaleFactor(); + OriginalVMEMLoadLatencyScaleFactor = TII->getVMEMLoadLatencyScaleFactor(); + const unsigned ILPLoadLatencyScaleFactorDefault = 300; + if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor()) + TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault); + if (ILPLoadLatencyScaleFactorDefault > TII->getDSReadLatencyScaleFactor()) + TII->setDSReadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault); + if (ILPLoadLatencyScaleFactorDefault > TII->getVMEMLoadLatencyScaleFactor()) + TII->setVMEMLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault); + + LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to " + << TII->getLoadLatencyScaleFactor() << '\n'); + return true; +} + +void ILPInitialScheduleStage::finalizeGCNSchedStage() { + const SIInstrInfo *TII = ST.getInstrInfo(); + TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor); + TII->setDSReadLatencyScaleFactor(OriginalDSReadLatencyScaleFactor); + TII->setVMEMLoadLatencyScaleFactor(OriginalVMEMLoadLatencyScaleFactor); + + LLVM_DEBUG( + dbgs() << "ILP Initial Schedule: Restored load latency scale factor to " + << OriginalLoadLatencyScaleFactor << "\n"); + + GCNSchedStage::finalizeGCNSchedStage(); +} + bool GCNSchedStage::initGCNRegion() { // Check whether this new region is also a new block. 
if (DAG.RegionBegin->getParent() != CurrentMBB) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 790370ff8ab4..5be6d4bde6cd 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -515,8 +515,15 @@ public: }; class ILPInitialScheduleStage : public GCNSchedStage { +private: + unsigned OriginalLoadLatencyScaleFactor = 0; + unsigned OriginalDSReadLatencyScaleFactor = 0; + unsigned OriginalVMEMLoadLatencyScaleFactor = 0; + public: bool shouldRevertScheduling(unsigned WavesAfter) override; + bool initGCNSchedStage() override; + void finalizeGCNSchedStage() override; ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) : GCNSchedStage(StageID, DAG) {} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5106478a95b4..a35aabd405a1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -62,9 +62,29 @@ static cl::opt<bool> Fix16BitCopies( cl::init(true), cl::ReallyHidden); +static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor( + "amdgpu-load-latency-scale-factor", + cl::desc("Scale factor for load instruction latency. Final latency is " + "scalled by `Factor / 100 * Latency`."), + cl::init(100), cl::ReallyHidden); + +static cl::opt<unsigned> AMDGPUDSReadLatencyScaleFactor( + "amdgpu-ds-read-latency-scale-factor", + cl::desc("Scale factor for LDS (DS) read instruction latency. Final " + "latency is scaled by `Factor / 100 * Latency`."), + cl::init(100), cl::ReallyHidden); + +static cl::opt<unsigned> AMDGPUVMEMLoadLatencyScaleFactor( + "amdgpu-vmem-load-latency-scale-factor", + cl::desc("Scale factor for VMEM/BUFFER/FLAT load instruction latency. 
" + "Final latency is scaled by `Factor / 100 * Latency`."), + cl::init(100), cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - RI(ST), ST(ST) { + RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor), + DSReadLatencyScaleFactor(AMDGPUDSReadLatencyScaleFactor), + VMEMLoadLatencyScaleFactor(AMDGPUVMEMLoadLatencyScaleFactor) { SchedModel.init(&ST); } @@ -10240,6 +10260,43 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return SchedModel.computeInstrLatency(&MI); } +std::optional<unsigned> +SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel, + const MachineInstr &MI) const { + auto LatencyOpt = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI); + if (!LatencyOpt) + return std::nullopt; + unsigned Latency = *LatencyOpt; + if (MI.mayLoad()) { + unsigned Scale = LoadLatencyScaleFactor; + if (isDS(MI)) + Scale = DSReadLatencyScaleFactor; + else if (isVMEM(MI) || isFLAT(MI)) + Scale = VMEMLoadLatencyScaleFactor; + Latency = (Latency * Scale) / 100; + } + return Latency; +} + +std::optional<unsigned> SIInstrInfo::getOperandLatency( + const TargetSchedModel &SchedModel, const MachineInstr *DefMI, + unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const { + auto LatOpt = TargetInstrInfo::getOperandLatency( + SchedModel, DefMI, DefOperIdx, UseMI, UseOperIdx); + if (!LatOpt) + return std::nullopt; + unsigned Latency = *LatOpt; + if (DefMI && DefMI->mayLoad()) { + unsigned Scale = LoadLatencyScaleFactor; + if (isDS(*DefMI)) + Scale = DSReadLatencyScaleFactor; + else if (isVMEM(*DefMI) || isFLAT(*DefMI)) + Scale = VMEMLoadLatencyScaleFactor; + Latency = (Latency * Scale) / 100; + } + return Latency; +} + InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); diff --git 
a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dffb3d7459e6..e01e3030b100 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -93,6 +93,13 @@ private: const GCNSubtarget &ST; TargetSchedModel SchedModel; mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter; + // Final load latency in the machine model is scaled by + // `Factor / 100 * Latency` + mutable unsigned LoadLatencyScaleFactor = 100; + // Separate scale factor for LDS (DS) read operations. + mutable unsigned DSReadLatencyScaleFactor = 100; + // Separate scale factor for VMEM/BUFFER/FLAT loads. + mutable unsigned VMEMLoadLatencyScaleFactor = 100; // The inverse predicate should have the negative value. enum BranchPredicate { @@ -111,6 +118,38 @@ private: static BranchPredicate getBranchPredicate(unsigned Opcode); public: + void setLoadLatencyScaleFactor(unsigned Factor) const { + LoadLatencyScaleFactor = Factor; + } + + unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; } + + // Control DS read (LDS) latency scaling independently when desired. + void setDSReadLatencyScaleFactor(unsigned Factor) const { + DSReadLatencyScaleFactor = Factor; + } + unsigned getDSReadLatencyScaleFactor() const { + return DSReadLatencyScaleFactor; + } + + // Control VMEM/BUFFER/FLAT load latency scaling independently. + void setVMEMLoadLatencyScaleFactor(unsigned Factor) const { + VMEMLoadLatencyScaleFactor = Factor; + } + unsigned getVMEMLoadLatencyScaleFactor() const { + return VMEMLoadLatencyScaleFactor; + } + + // TargetSchedModel latency hooks.
+ std::optional<unsigned> + getInstrLatency(const TargetSchedModel &TargetSchedModel, + const MachineInstr &MI) const override; + std::optional<unsigned> getOperandLatency(const TargetSchedModel &SchedModel, + const MachineInstr *DefMI, + unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, |
