diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 138 |
1 files changed, 80 insertions, 58 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c501ebba0c7e..484861dcaac0 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -106,6 +106,7 @@ private: bool IsLastUse = false; bool IsCooperative = false; + // TODO: Should we assume Cooperative=true if no MMO is present? SIMemOpInfo( const GCNSubtarget &ST, AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, @@ -299,6 +300,10 @@ protected: bool enableNamedBit(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Bit) const; + /// Check if any atomic operation on AS can affect memory accessible via the + /// global address space. + bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const; + public: /// Create a cache control for the subtarget \p ST. @@ -334,6 +339,11 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; + /// Add final touches to a `mayStore` instruction \p MI, which may be a + /// Store or RMW instruction. + /// FIXME: This takes a MI because iterators aren't handled properly. When + /// this is called, they often point to entirely different insts. Thus we back + /// up the inst early and pass it here instead. virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -991,6 +1001,15 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, return true; } +bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const { + assert((!ST.hasGloballyAddressableScratch() || + (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE || + (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) && + "scratch instructions should already be replaced by flat " + "instructions if GloballyAddressableScratch is enabled"); + return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE; +} + /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); @@ -1016,7 +1035,7 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1239,7 +1258,7 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1299,7 +1318,7 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1336,7 +1355,7 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1378,7 +1397,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1487,7 +1506,7 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Ensures that following loads will not see stale remote VMEM data or @@ -1551,7 +1570,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the @@ -1594,7 +1613,7 @@ bool SIGfx940CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC bits to indicate system scope. @@ -1638,7 +1657,7 @@ bool SIGfx940CacheControl::enableStoreCacheBypass( assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC bits to indicate system scope. @@ -1678,7 +1697,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC1 bit to indicate system scope. @@ -1756,7 +1775,7 @@ bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Ensures that following loads will not see stale remote VMEM data or @@ -1840,7 +1859,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the @@ -1897,7 +1916,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2129,7 +2148,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2194,7 +2213,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2368,7 +2387,10 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // which shares the same L0. // // GFX12.5: - // TODO DOCS + // CU$ has two ports. To ensure operations are visible at the workgroup + // level, we need to ensure all operations in this port have completed + // so the other SIMDs in the WG can see them. There is no ordering + // guarantee between the ports. if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; @@ -2462,7 +2484,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, /// memory. /// Other address spaces do not have a cache. - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) + if (!canAffectGlobalAddrSpace(AddrSpace)) return false; AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; @@ -2483,8 +2505,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, // Otherwise in CU mode all waves of a work-group are on the same CU, and // so the L0 does not need to be invalidated. // - // GFX12.5 - // TODO DOCS + // GFX12.5 has a shared WGP$, so no invalidates are required. if (ST.isCuModeEnabled()) return false; @@ -2514,6 +2535,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -2521,53 +2544,52 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // writeback as all memory operations by the same thread are // sequentially consistent, and no other thread can access scratch // memory. + if (canAffectGlobalAddrSpace(AddrSpace)) { + if (Pos == Position::AFTER) + ++MI; - // Other address spaces do not have a cache. - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) - return false; - - if (Pos == Position::AFTER) - ++MI; - - // global_wb is only necessary at system scope for GFX12.0, - // they're also necessary at device scope for GFX12.5. - // - // Emitting it for lower scopes is a slow no-op, so we omit it - // for performance. - switch (Scope) { - case SIAtomicScope::SYSTEM: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_SYS); - break; - case SIAtomicScope::AGENT: - // TODO DOCS - if (ST.hasGFX1250Insts()) { + // global_wb is only necessary at system scope for GFX12.0, + // they're also necessary at device scope for GFX12.5 as stores + // cannot report completion earlier than L2. + // + // Emitting it for lower scopes is a slow no-op, so we omit it + // for performance. + switch (Scope) { + case SIAtomicScope::SYSTEM: BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_DEV); + .addImm(AMDGPU::CPol::SCOPE_SYS); + Changed = true; + break; + case SIAtomicScope::AGENT: + // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + Changed = true; + } + break; + case SIAtomicScope::CLUSTER: + case SIAtomicScope::WORKGROUP: + // No WB necessary, but we still have to wait. + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No WB or wait necessary here, but insertWait takes care of that. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::CLUSTER: - case SIAtomicScope::WORKGROUP: - // No WB necessary, but we still have to wait. - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No WB or wait necessary here. - return false; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - if (Pos == Position::AFTER) - --MI; + if (Pos == Position::AFTER) + --MI; + } // We always have to wait for previous memory operations (load/store) to // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. - insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); - return true; + return Changed; } bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( @@ -2655,7 +2677,7 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace) const { bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); |
