summary | refs | log | tree | commit | diff
path: root/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp 138
1 files changed, 80 insertions, 58 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c501ebba0c7e..484861dcaac0 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -106,6 +106,7 @@ private:
bool IsLastUse = false;
bool IsCooperative = false;
+ // TODO: Should we assume Cooperative=true if no MMO is present?
SIMemOpInfo(
const GCNSubtarget &ST,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -299,6 +300,10 @@ protected:
bool enableNamedBit(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Bit) const;
+ /// Check if any atomic operation on AS can affect memory accessible via the
+ /// global address space.
+ bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
+
public:
/// Create a cache control for the subtarget \p ST.
@@ -334,6 +339,11 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
+ /// Add final touches to a `mayStore` instruction \p MI, which may be a
+ /// Store or RMW instruction.
+ /// FIXME: This takes an MI because iterators aren't handled properly. When
+ /// this is called, they often point to entirely different insts. Thus we back
+ /// up the inst early and pass it here instead.
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -991,6 +1001,15 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
return true;
}
+bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
+ assert((!ST.hasGloballyAddressableScratch() ||
+ (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
+ (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
+ "scratch instructions should already be replaced by flat "
+ "instructions if GloballyAddressableScratch is enabled");
+ return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
+}
+
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
@@ -1016,7 +1035,7 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1239,7 +1258,7 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1299,7 +1318,7 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1336,7 +1355,7 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass(
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1378,7 +1397,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1487,7 +1506,7 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Ensures that following loads will not see stale remote VMEM data or
@@ -1551,7 +1570,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
@@ -1594,7 +1613,7 @@ bool SIGfx940CacheControl::enableLoadCacheBypass(
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Set SC bits to indicate system scope.
@@ -1638,7 +1657,7 @@ bool SIGfx940CacheControl::enableStoreCacheBypass(
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Set SC bits to indicate system scope.
@@ -1678,7 +1697,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Set SC1 bit to indicate system scope.
@@ -1756,7 +1775,7 @@ bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Ensures that following loads will not see stale remote VMEM data or
@@ -1840,7 +1859,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
@@ -1897,7 +1916,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -2129,7 +2148,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -2194,7 +2213,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -2368,7 +2387,10 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// which shares the same L0.
//
// GFX12.5:
- // TODO DOCS
+ // CU$ has two ports. To ensure operations are visible at the workgroup
+ // level, we need to ensure all operations in this port have completed
+ // so the other SIMDs in the WG can see them. There is no ordering
+ // guarantee between the ports.
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
@@ -2462,7 +2484,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
/// memory.
/// Other address spaces do not have a cache.
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+ if (!canAffectGlobalAddrSpace(AddrSpace))
return false;
AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
@@ -2483,8 +2505,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
// Otherwise in CU mode all waves of a work-group are on the same CU, and
// so the L0 does not need to be invalidated.
//
- // GFX12.5
- // TODO DOCS
+ // GFX12.5 has a shared WGP$, so no invalidates are required.
if (ST.isCuModeEnabled())
return false;
@@ -2514,6 +2535,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
@@ -2521,53 +2544,52 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// writeback as all memory operations by the same thread are
// sequentially consistent, and no other thread can access scratch
// memory.
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ if (Pos == Position::AFTER)
+ ++MI;
- // Other address spaces do not have a cache.
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
- return false;
-
- if (Pos == Position::AFTER)
- ++MI;
-
- // global_wb is only necessary at system scope for GFX12.0,
- // they're also necessary at device scope for GFX12.5.
- //
- // Emitting it for lower scopes is a slow no-op, so we omit it
- // for performance.
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
- .addImm(AMDGPU::CPol::SCOPE_SYS);
- break;
- case SIAtomicScope::AGENT:
- // TODO DOCS
- if (ST.hasGFX1250Insts()) {
+ // global_wb is only necessary at system scope for GFX12.0; it is also
+ // necessary at device scope for GFX12.5, as stores cannot report
+ // completion earlier than L2.
+ //
+ // Emitting it for lower scopes is a slow no-op, so we omit it
+ // for performance.
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
- .addImm(AMDGPU::CPol::SCOPE_DEV);
+ .addImm(AMDGPU::CPol::SCOPE_SYS);
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
+ if (ST.hasGFX1250Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_DEV);
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::CLUSTER:
+ case SIAtomicScope::WORKGROUP:
+ // No WB necessary, but we still have to wait.
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No WB or wait necessary here, but insertWait takes care of that.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- break;
- case SIAtomicScope::CLUSTER:
- case SIAtomicScope::WORKGROUP:
- // No WB necessary, but we still have to wait.
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No WB or wait necessary here.
- return false;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- if (Pos == Position::AFTER)
- --MI;
+ if (Pos == Position::AFTER)
+ --MI;
+ }
// We always have to wait for previous memory operations (load/store) to
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
- insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
- return true;
+ return Changed;
}
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
@@ -2655,7 +2677,7 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace) const {
bool Changed = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);