[AMDGPU] Cleanup memory legalizer interfaces

- Rename interfaces to be in terms of acquire and release. - Improve comments. Differential Revision: https://reviews.llvm.org/D89355
2020-10-13 02:06:33 +00:00 · 2020-10-13 02:06:33 +00:00 · 907d799070
parent 4b6e3ad9b4
commit 907d799070
1 changed files with 124 additions and 98 deletions
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@ -280,23 +280,23 @@ public:
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

-  /// Inserts any necessary instructions at position \p Pos relative
-  /// to instruction \p MI to ensure any caches associated with
-  /// address spaces \p AddrSpace for memory scopes up to memory scope
-  /// \p Scope are invalidated. Returns true iff any instructions
-  /// inserted.
-  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                     SIAtomicScope Scope,
-                                     SIAtomicAddrSpace AddrSpace,
-                                     Position Pos) const = 0;
+  /// Inserts any necessary instructions at position \p Pos relative to
+  /// instruction \p MI to ensure any subsequent memory instructions of this
+  /// thread with address spaces \p AddrSpace will observe the previous memory
+  /// operations by any thread for memory scopes up to memory scope \p Scope .
+  /// Returns true iff any instructions inserted.
+  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
-  /// to instruction \p MI to ensure memory instructions of kind \p Op
-  /// associated with address spaces \p AddrSpace have completed as
-  /// observed by other memory instructions executing in memory scope
-  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
-  /// ordering is between address spaces. Returns true iff any
-  /// instructions inserted.
+  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
+  /// \p Op associated with address spaces \p AddrSpace have completed. Used
+  /// between memory instructions to enforce the order they become visible as
+  /// observed by other memory instructions executing in memory scope \p Scope.
+  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
+  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
@ -304,6 +304,18 @@ public:
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

+  /// Inserts any necessary instructions at position \p Pos relative to
+  /// instruction \p MI to ensure previous memory instructions by this thread
+  /// with address spaces \p AddrSpace have completed and can be observed by
+  /// subsequent memory instructions by any thread executing in memory scope \p
+  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
+  /// between address spaces. Returns true iff any instructions inserted.
+  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             bool IsCrossAddrSpaceOrdering,
+                             Position Pos) const = 0;
+
  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

@ -334,10 +346,16 @@ public:

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

-  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace,
-                             Position Pos) const override;
+  bool insertAcquire(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     Position Pos) const override;
+
+  bool insertRelease(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     bool IsCrossAddrSpaceOrdering,
+                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
@ -352,10 +370,10 @@ public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

-  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace,
-                             Position Pos) const override;
+  bool insertAcquire(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     Position Pos) const override;

 };

@ -380,10 +398,10 @@ public:

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

-  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace,
-                             Position Pos) const override;
+  bool insertAcquire(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
@ -678,9 +696,6 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
-    /// TODO: Do not set glc for rmw atomic operations as they
-    /// implicitly bypass the L1 cache.
-
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
@ -718,10 +733,10 @@ bool SIGfx6CacheControl::enableNonTemporal(
  return Changed;
 }

-bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                               SIAtomicScope Scope,
-                                               SIAtomicAddrSpace AddrSpace,
-                                               Position Pos) const {
+bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                       SIAtomicScope Scope,
+                                       SIAtomicAddrSpace AddrSpace,
+                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

@ -802,12 +817,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
-      // If no cross address space ordering then an LDS waitcnt is not
-      // needed as LDS operations for all waves are executed in a
-      // total global ordering as observed by all waves. Required if
-      // also synchronizing with global/GDS memory as LDS operations
-      // could be reordered with respect to later global/GDS memory
-      // operations of the same wave.
+      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+      // not needed as LDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. Required if also
+      // synchronizing with global/GDS memory as LDS operations could be
+      // reordered with respect to later global/GDS memory operations of the
+      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
@ -824,12 +839,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
-      // If no cross address space ordering then an GDS waitcnt is not
-      // needed as GDS operations for all waves are executed in a
-      // total global ordering as observed by all waves. Required if
-      // also synchronizing with global/LDS memory as GDS operations
-      // could be reordered with respect to later global/LDS memory
-      // operations of the same wave.
+      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
+      // is not needed as GDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. Required if also
+      // synchronizing with global/LDS memory as GDS operations could be
+      // reordered with respect to later global/LDS memory operations of the
+      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
@ -859,10 +874,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
  return Changed;
 }

-bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                               SIAtomicScope Scope,
-                                               SIAtomicAddrSpace AddrSpace,
-                                               Position Pos) const {
+bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+                                       SIAtomicScope Scope,
+                                       SIAtomicAddrSpace AddrSpace,
+                                       bool IsCrossAddrSpaceOrdering,
+                                       Position Pos) const {
+    return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+                      IsCrossAddrSpaceOrdering, Pos);
+}
+
+bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                       SIAtomicScope Scope,
+                                       SIAtomicAddrSpace AddrSpace,
+                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

@ -873,9 +897,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

-  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
-                             ? AMDGPU::BUFFER_WBINVL1
-                             : AMDGPU::BUFFER_WBINVL1_VOL;
+  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
+                                    ? AMDGPU::BUFFER_WBINVL1
+                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;
@ -884,7 +908,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(Flush));
+      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
@ -902,7 +926,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

-  /// Other address spaces do not hava a cache.
+  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;
@ -930,8 +954,8 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
-      // CU mode and all waves of a work-group are on the same CU, and so the
-      // L0 does not need to be bypassed.
+      // CU mode all waves of a work-group are on the same CU, and so the L0
+      // does not need to be bypassed.
      if (!CuMode) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
@ -964,10 +988,10 @@ bool SIGfx10CacheControl::enableNonTemporal(
  return Changed;
 }

-bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                                SIAtomicScope Scope,
-                                                SIAtomicAddrSpace AddrSpace,
-                                                Position Pos) const {
+bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                        SIAtomicScope Scope,
+                                        SIAtomicAddrSpace AddrSpace,
+                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

@ -1074,12 +1098,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
-      // If no cross address space ordering then an LDS waitcnt is not
-      // needed as LDS operations for all waves are executed in a
-      // total global ordering as observed by all waves. Required if
-      // also synchronizing with global/GDS memory as LDS operations
-      // could be reordered with respect to later global/GDS memory
-      // operations of the same wave.
+      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+      // not needed as LDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. Required if also
+      // synchronizing with global/GDS memory as LDS operations could be
+      // reordered with respect to later global/GDS memory operations of the
+      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
@ -1096,12 +1120,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
-      // If no cross address space ordering then an GDS waitcnt is not
-      // needed as GDS operations for all waves are executed in a
-      // total global ordering as observed by all waves. Required if
-      // also synchronizing with global/LDS memory as GDS operations
-      // could be reordered with respect to later global/LDS memory
-      // operations of the same wave.
+      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
+      // is not needed as GDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. Required if also
+      // synchronizing with global/LDS memory as GDS operations could be
+      // reordered with respect to later global/LDS memory operations of the
+      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
@ -1177,9 +1201,9 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
-      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
-                                           MOI.getOrderingAddrSpace(),
-                                           Position::AFTER);
+      Changed |= CC->insertAcquire(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   Position::AFTER);
    }

    return Changed;
@ -1203,11 +1227,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-      Changed |= CC->insertWait(MI, MOI.getScope(),
-                                MOI.getOrderingAddrSpace(),
-                                SIMemOp::LOAD | SIMemOp::STORE,
-                                MOI.getIsCrossAddressSpaceOrdering(),
-                                Position::BEFORE);
+      Changed |= CC->insertRelease(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   MOI.getIsCrossAddressSpaceOrdering(),
+                                   Position::BEFORE);

    return Changed;
  }
@ -1239,19 +1262,23 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
-      /// adding waitcnt before a S_BARRIER.
-      Changed |= CC->insertWait(MI, MOI.getScope(),
-                                MOI.getOrderingAddrSpace(),
-                                SIMemOp::LOAD | SIMemOp::STORE,
-                                MOI.getIsCrossAddressSpaceOrdering(),
-                                Position::BEFORE);
+      /// adding S_WAITCNT before a S_BARRIER.
+      Changed |= CC->insertRelease(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   MOI.getIsCrossAddressSpaceOrdering(),
+                                   Position::BEFORE);
+
+    // TODO: If both release and invalidate are happening they could be combined
+    // to use the single "BUFFER_WBL2" instruction. This could be done by
+    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
+    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
-                                           MOI.getOrderingAddrSpace(),
-                                           Position::BEFORE);
+      Changed |= CC->insertAcquire(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   Position::BEFORE);

    return Changed;
  }
@ -1270,11 +1297,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
-      Changed |= CC->insertWait(MI, MOI.getScope(),
-                                MOI.getOrderingAddrSpace(),
-                                SIMemOp::LOAD | SIMemOp::STORE,
-                                MOI.getIsCrossAddressSpaceOrdering(),
-                                Position::BEFORE);
+      Changed |= CC->insertRelease(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   MOI.getIsCrossAddressSpaceOrdering(),
+                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@ -1287,9 +1313,9 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
-      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
-                                           MOI.getOrderingAddrSpace(),
-                                           Position::AFTER);
+      Changed |= CC->insertAcquire(MI, MOI.getScope(),
+                                   MOI.getOrderingAddrSpace(),
+                                   Position::AFTER);
    }

    return Changed;