[AMDGPU] Cleanup memory legalizer interfaces

- Rename interfaces to be in terms of acquire and release.
- Improve comments.

Differential Revision: https://reviews.llvm.org/D89355
This commit is contained in:
Tony 2020-10-13 02:06:33 +00:00
parent 4b6e3ad9b4
commit 907d799070
1 changed files with 124 additions and 98 deletions

View File

@ -280,23 +280,23 @@ public:
virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
const = 0;
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure any caches associated with
/// address spaces \p AddrSpace for memory scopes up to memory scope
/// \p Scope are invalidated. Returns true iff any instructions
/// inserted.
virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
/// thread with address spaces \p AddrSpace will observe the previous memory
/// operations by any thread for memory scopes up to memory scope \p Scope .
/// Returns true iff any instructions inserted.
virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions of kind \p Op
/// associated with address spaces \p AddrSpace have completed as
/// observed by other memory instructions executing in memory scope
/// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
/// ordering is between address spaces. Returns true iff any
/// instructions inserted.
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
/// address spaces. Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
@ -304,6 +304,18 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure previous memory instructions by this thread
/// with address spaces \p AddrSpace have completed and can be observed by
/// subsequent memory instructions by any thread executing in memory scope \p
/// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
/// between address spaces. Returns true iff any instructions inserted.
virtual bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
@ -334,10 +346,16 @@ public:
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@ -352,10 +370,10 @@ public:
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
};
@ -380,10 +398,10 @@ public:
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@ -678,9 +696,6 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
/// TODO: Do not set glc for rmw atomic operations as they
/// implicitly bypass the L1 cache.
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@ -718,10 +733,10 @@ bool SIGfx6CacheControl::enableNonTemporal(
return Changed;
}
bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
@ -802,12 +817,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an LDS waitcnt is not
// needed as LDS operations for all waves are executed in a
// total global ordering as observed by all waves. Required if
// also synchronizing with global/GDS memory as LDS operations
// could be reordered with respect to later global/GDS memory
// operations of the same wave.
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/GDS memory as LDS operations could be
// reordered with respect to later global/GDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
@ -824,12 +839,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
// If no cross address space ordering then an GDS waitcnt is not
// needed as GDS operations for all waves are executed in a
// total global ordering as observed by all waves. Required if
// also synchronizing with global/LDS memory as GDS operations
// could be reordered with respect to later global/LDS memory
// operations of the same wave.
// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
// is not needed as GDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/LDS memory as GDS operations could be
// reordered with respect to later global/LDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
@ -859,10 +874,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos);
}
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
@ -873,9 +897,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
? AMDGPU::BUFFER_WBINVL1
: AMDGPU::BUFFER_WBINVL1_VOL;
const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
? AMDGPU::BUFFER_WBINVL1
: AMDGPU::BUFFER_WBINVL1_VOL;
if (Pos == Position::AFTER)
++MI;
@ -884,7 +908,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
BuildMI(MBB, MI, DL, TII->get(Flush));
BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
@ -902,7 +926,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not hava a cache.
/// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
@ -930,8 +954,8 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
// CU mode and all waves of a work-group are on the same CU, and so the
// L0 does not need to be bypassed.
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!CuMode) Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
@ -964,10 +988,10 @@ bool SIGfx10CacheControl::enableNonTemporal(
return Changed;
}
bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
@ -1074,12 +1098,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an LDS waitcnt is not
// needed as LDS operations for all waves are executed in a
// total global ordering as observed by all waves. Required if
// also synchronizing with global/GDS memory as LDS operations
// could be reordered with respect to later global/GDS memory
// operations of the same wave.
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/GDS memory as LDS operations could be
// reordered with respect to later global/GDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
@ -1096,12 +1120,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
// If no cross address space ordering then an GDS waitcnt is not
// needed as GDS operations for all waves are executed in a
// total global ordering as observed by all waves. Required if
// also synchronizing with global/LDS memory as GDS operations
// could be reordered with respect to later global/LDS memory
// operations of the same wave.
// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
// is not needed as GDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/LDS memory as GDS operations could be
// reordered with respect to later global/LDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
@ -1177,9 +1201,9 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
}
return Changed;
@ -1203,11 +1227,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
return Changed;
}
@ -1239,19 +1262,23 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
/// ordering and memory scope, then library does not need to
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
/// adding waitcnt before a S_BARRIER.
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
/// adding S_WAITCNT before a S_BARRIER.
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
// TODO: If both release and invalidate are happening they could be combined
// to use the single "BUFFER_WBL2" instruction. This could be done by
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
// track cache invalidate and write back instructions.
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::BEFORE);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::BEFORE);
return Changed;
}
@ -1270,11 +1297,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@ -1287,9 +1313,9 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
}
return Changed;