[AMDGPU] Restrict immediate scratch offsets

gfx9 does not work with negative offsets; gfx10 works only with
aligned negative offsets, not with unaligned negative offsets.

This is slightly more conservative than needed: gfx9 does support
negative offsets when a VGPR address is used, and gfx10 supports
negative, unaligned offsets when an SGPR address is used, but we
do not make use of that with this patch.

Differential Revision: https://reviews.llvm.org/D101292
This commit is contained in:
Sebastian Neubauer 2021-05-03 10:14:12 +02:00
parent 606d4e8061
commit 13c0316239
7 changed files with 815 additions and 301 deletions

View File

@ -232,6 +232,18 @@ def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
"GFX10 bug where inst_offset is ignored when flat instructions access global memory"
>;
// GFX9 hardware bug: scratch instructions that combine an SGPR address with a
// negative immediate offset page fault. Code generation must avoid emitting
// negative immediate scratch offsets on subtargets carrying this feature.
def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug",
"NegativeScratchOffsetBug",
"true",
"Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9"
>;
// GFX10 hardware bug: scratch instructions that combine a VGPR address with a
// negative immediate offset that is not a multiple of 4 read the wrong memory.
// Negative immediate scratch offsets must therefore be kept 4-aligned on
// subtargets carrying this feature.
def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug",
"NegativeUnalignedScratchOffsetBug",
"true",
"Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10"
>;
def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
"HasOffset3fBug",
"true",
@ -771,7 +783,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureNegativeScratchOffsetBug
]
>;
@ -988,7 +1001,8 @@ def FeatureGroup {
FeatureLdsBranchVmemWARHazard,
FeatureNSAtoVMEMBug,
FeatureOffset3fBug,
FeatureFlatSegmentOffsetBug
FeatureFlatSegmentOffsetBug,
FeatureNegativeUnalignedScratchOffsetBug
];
}

View File

@ -1669,7 +1669,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1)) {
uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
@ -1911,17 +1911,11 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << (NumBits - 1);
int64_t RemainderOffset = (COffsetVal / D) * D;
int64_t ImmField = COffsetVal - RemainderOffset;
int64_t SplitImmOffset, RemainderOffset;
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch));
assert(RemainderOffset + ImmField == COffsetVal);
COffsetVal = ImmField;
COffsetVal = SplitImmOffset;
SDLoc DL(N);
SDValue AddOffset =

View File

@ -195,7 +195,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM) :
const GCNTargetMachine &TM)
: // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
@ -238,6 +239,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
NegativeScratchOffsetBug(false),
NegativeUnalignedScratchOffsetBug(false),
HasSMemRealTime(false),
HasIntClamp(false),
HasFmaMixInsts(false),
@ -312,6 +315,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
// clang-format on
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));

View File

@ -113,6 +113,8 @@ protected:
bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool NegativeScratchOffsetBug;
bool NegativeUnalignedScratchOffsetBug;
bool HasSMemRealTime;
bool HasIntClamp;
bool HasFmaMixInsts;
@ -890,6 +892,12 @@ public:
return SGPRInitBug;
}
bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
/// \returns true if, on this subtarget, scratch instructions with a negative
/// immediate offset that is not a multiple of 4 can read wrong memory
/// (GFX10 bug; see the corresponding subtarget feature).
bool hasNegativeUnalignedScratchOffsetBug() const { return NegativeUnalignedScratchOffsetBug; }
/// \returns the value of the MFMA-inline-literal bug flag for this subtarget.
bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; }

View File

@ -7368,6 +7368,36 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
// In general, flat instruction offsets can only be non-negative; global and
// scratch instruction offsets can also be negative.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
// use an offset.
//
// For scratch instructions, the address can be either an SGPR or a VGPR.
// The following offsets can be used, depending on the architecture (x means
// cannot be used):
// +----------------------------+------+------+
// | Address-Mode | SGPR | VGPR |
// +----------------------------+------+------+
// | gfx9 | | |
// | negative, 4-aligned offset | x | ok |
// | negative, unaligned offset | x | ok |
// +----------------------------+------+------+
// | gfx10 | | |
// | negative, 4-aligned offset | ok | ok |
// | negative, unaligned offset | ok | x |
// +----------------------------+------+------+
// | gfx10.3 | | |
// | negative, 4-aligned offset | ok | ok |
// | negative, unaligned offset | ok | ok |
// +----------------------------+------+------+
//
// This function ignores the addressing mode, so if an offset cannot be used in
// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
uint64_t FlatVariant) const {
// TODO: Should 0 be special cased?
@ -7380,22 +7410,44 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
return false;
bool Signed = FlatVariant != SIInstrFlags::FLAT;
if (ST.hasNegativeScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch)
Signed = false;
if (ST.hasNegativeUnalignedScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
(Offset % 4) != 0) {
return false;
}
unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
}
// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
std::pair<int64_t, int64_t>
SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
bool Signed = FlatVariant != SIInstrFlags::FLAT;
if (ST.hasNegativeScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch)
Signed = false;
const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
if (Signed) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << (NumBits - 1);
RemainderOffset = (COffsetVal / D) * D;
ImmField = COffsetVal - RemainderOffset;
if (ST.hasNegativeUnalignedScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
(ImmField % 4) != 0) {
// Make ImmField a multiple of 4
RemainderOffset += ImmField % 4;
ImmField -= ImmField % 4;
}
} else if (COffsetVal >= 0) {
ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
RemainderOffset = COffsetVal - ImmField;

File diff suppressed because it is too large Load Diff

View File

@ -279,30 +279,31 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_add_u32 s2, 16, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
; FLATSCR-NEXT: s_mov_b32 s3, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:1024
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: BB2_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 s4, 0x4000, s3
; FLATSCR-NEXT: s_add_i32 s3, s3, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s3, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s4
; FLATSCR-NEXT: s_add_u32 s3, 0x2000, s2
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 BB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
; FLATSCR-NEXT: s_add_u32 s3, 0x4000, s3
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s3 offset:720 glc
; FLATSCR-NEXT: s_movk_i32 s2, 0x1000
; FLATSCR-NEXT: s_add_u32 s2, 0x2000, s2
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s3 offset:704 glc
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 glc
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 offset:-16 glc
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v12, 0
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6