AMDGPU: Fix broken FrameIndex handling

We were trying to avoid using a FrameIndex operand in non-pointer operands in a convoluted way, and would break because of using TargetFrameIndex. The TargetFrameIndex should only be used in the case where it makes sense to fold it as part of the addressing mode, otherwise it requires materialization like a normal constant. This wasn't working reliably and failed in the added testcase, hitting the assert when processing the frame index. The TargetFrameIndex was coming from trying to produce an AssertZext limiting the maximum stack size. I'm not sure this was correct to begin with, because it is apparently possible to have a single workitem dispatch that requires all 4G of private memory. llvm-svn: 281824
2016-09-17 16:09:55 +00:00 · 2016-09-17 16:09:55 +00:00 · ac0fc849cf
parent bcfd94c298
commit ac0fc849cf
7 changed files with 66 additions and 108 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@ -51,10 +51,10 @@ public:
  bool runOnMachineFunction(MachineFunction &MF) override;
  void Select(SDNode *N) override;
  const char *getPassName() const override;
-  void PreprocessISelDAG() override;
  void PostprocessISelDAG() override;

 private:
+  SDValue foldFrameIndex(SDValue N) const;
  bool isInlineImmediate(const SDNode *N) const;
  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                   const R600InstrInfo *TII);
@ -902,6 +902,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
 }

+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  return N;
+}
+
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
                                            SDValue &VAddr, SDValue &SOffset,
                                            SDValue &ImmOffset) const {
@ -921,14 +927,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
    // Offsets in vaddr must be positive.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isLegalMUBUFImmOffset(C1)) {
-      VAddr = N0;
+      VAddr = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
-  VAddr = Addr;
+  VAddr = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
 }
@ -1516,62 +1522,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
  return SelectVOP3Mods(In, Src, SrcMods);
 }

-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
-  // Handle the perverse case where a frame index is being stored. We don't
-  // want to see multiple frame index operands on the same instruction since
-  // it complicates things and violates some assumptions about frame index
-  // lowering.
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
-    // It's possible that we have a frame index defined in the function that
-    // isn't used in this block.
-    if (FI.use_empty())
-      continue;
-
-    // Skip over the AssertZext inserted during lowering.
-    SDValue EffectiveFI = FI;
-    auto It = FI->use_begin();
-    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
-      EffectiveFI = SDValue(*It, 0);
-      It = EffectiveFI->use_begin();
-    }
-
-    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
-      SDUse &Use = It.getUse();
-      SDNode *User = Use.getUser();
-      unsigned OpIdx = It.getOperandNo();
-      ++It;
-
-      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
-        unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
-        if (OpIdx == PtrIdx)
-          continue;
-
-        unsigned OpN = M->getNumOperands();
-        SDValue NewOps[8];
-
-        assert(OpN < array_lengthof(NewOps));
-        for (unsigned Op = 0; Op != OpN; ++Op) {
-          if (Op != OpIdx) {
-            NewOps[Op] = M->getOperand(Op);
-            continue;
-          }
-
-          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                      SDLoc(M), MVT::i32, FI);
-          NewOps[Op] = SDValue(Mov, 0);
-        }
-
-        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
-      }
-    }
-  }
-}
-
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering& Lowering =
    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@ -89,7 +89,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
@ -1558,7 +1557,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
@ -1605,43 +1603,6 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
  return nullptr;
 }

-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  SDLoc SL(Op);
-  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
-  unsigned FrameIndex = FINode->getIndex();
-
-  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
-  // high bit of a frame index offset were to be set, this would mean that it
-  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
-  // buffer, with 64 being the number of threads per wave.
-  //
-  // The maximum private allocation for the entire GPU is 4G, and we are
-  // concerned with the largest the index could ever be for an individual
-  // workitem. This will occur with the minmum dispatch size. If a program
-  // requires more, the dispatch size will be reduced.
-  //
-  // With this limit, we can mark the high bit of the FrameIndex node as known
-  // zero, which is important, because it means in most situations we can prove
-  // that values derived from FrameIndex nodes are non-negative. This enables us
-  // to take advantage of more addressing modes when accessing scratch buffers,
-  // since for scratch reads/writes, the register offset must always be
-  // positive.
-
-  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
-  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
-  // granularity. It is probably a full wave.
-  uint64_t MinGranularity = 32;
-
-  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
-  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                     DAG.getValueType(ExtVT));
-}
-
 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@ -33,7 +33,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@ -278,6 +278,11 @@ return CurDAG->getTargetConstant(
  N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
 }]>;

+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@ -1581,6 +1581,11 @@ def : Pat <
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;

+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
 def : Pat <
  (i64 InlineImm<i64>:$imm),
  (S_MOV_B64 InlineImm<i64>:$imm)
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@ -1,5 +1,17 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

+; GCN-LABEL: {{^}}store_fi_lifetime:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %b = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %b)
+  store volatile i8* %b, i8* addrspace(1)* undef
+  call void @llvm.lifetime.end(i64 1, i8* %b)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
@ -140,17 +152,18 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
 }

 ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
-; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]

-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
-; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]]
+; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}

-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
-; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
  %tmp0 = alloca [4096 x i32]
  %tmp1 = alloca [4096 x i32]
@ -163,4 +176,27 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
  ret void
 }

+@g1 = external addrspace(1) global i32*
+
+; This was leaving a dead node around resulting in failing to select
+; on the leftover AssertZext's ValueType operand.
+
+; GCN-LABEL: {{^}}cannot_select_assertzext_valuetype:
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, g1@GOTPCREL+4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+entry:
+  %b = alloca i32, align 4
+  %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  store volatile i32* %b, i32* addrspace(1)* undef
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
 attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
--- a/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@ -7,8 +7,10 @@
 ;
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
-; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 define amdgpu_ps float @main(i32 %idx) {