AMDGPU: Track physreg uses in SILoadStoreOptimizer

Summary:
This handles def-after-use of physregs, and allows us to merge loads and
stores even across some physreg defs (typically M0 defs).
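
To illustrate the idea, here is a minimal standalone C++ sketch with invented toy types and register names (the actual pass operates on MachineInstr/MachineOperand, and additionally checks memory dependences, which this sketch ignores). While scanning from the first load toward a merge candidate, we record both the registers defined by instructions that will be moved and the physical registers they read; a later instruction must join the move list if it reads one of those defs, or if it defines a physical register that a moved instruction reads (the def-after-use case):

// Toy model of the new def/use tracking; all types and names here are
// invented for illustration, only the decision logic mirrors the patch.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Operand {
  std::string Reg; // register name, e.g. "m0", "scc", "%2"
  bool IsDef;      // this operand defines Reg
  bool IsPhys;     // physical register such as m0 or scc
};

struct Inst {
  std::string Name;
  std::vector<Operand> Ops;
};

// Mirrors addDefsUsesToList: record the defs, and the physreg reads, of an
// instruction that is going to be moved.
static void addDefsUses(const Inst &I, std::set<std::string> &RegDefs,
                        std::set<std::string> &PhysRegUses) {
  for (const Operand &Op : I.Ops) {
    if (Op.IsDef)
      RegDefs.insert(Op.Reg);
    else if (Op.IsPhys)
      PhysRegUses.insert(Op.Reg);
  }
}

// Mirrors the extended check in addToListsIfDependent: an instruction must
// move along if it reads a moved def, or defines a physreg that a moved
// instruction reads (def-after-use).
static bool dependsOnMovedSet(const Inst &I,
                              const std::set<std::string> &RegDefs,
                              const std::set<std::string> &PhysRegUses) {
  for (const Operand &Op : I.Ops) {
    if (!Op.IsDef && RegDefs.count(Op.Reg))
      return true;
    if (Op.IsDef && Op.IsPhys && PhysRegUses.count(Op.Reg))
      return true;
  }
  return false;
}

int main() {
  // Shape of the scc_def_and_use_no_dependency MIR test below: the first
  // DS read defines %2 and reads m0; an scc-defining add sits between the
  // two reads.
  Inst Read0 = {"DS_READ_B32", {{"%2", true, false}, {"m0", false, true}}};
  Inst Add = {"S_ADD_U32", {{"%11", true, false}, {"scc", true, true}}};
  Inst ReadLane =
      {"V_READFIRSTLANE_B32", {{"%20", true, false}, {"%2", false, false}}};

  std::set<std::string> RegDefs, PhysRegUses;
  addDefsUses(Read0, RegDefs, PhysRegUses);

  // The add only defines scc, which nothing being moved reads: it is
  // independent, so the loads can still be merged. The old code bailed out
  // on any physreg def here (hasPhysRegDef).
  std::printf("S_ADD_U32 dependent: %d\n",
              dependsOnMovedSet(Add, RegDefs, PhysRegUses)); // prints 0
  // V_READFIRSTLANE_B32 reads %2, a def of the moved load, so it has to be
  // moved below the merge point together with the load.
  std::printf("V_READFIRSTLANE_B32 dependent: %d\n",
              dependsOnMovedSet(ReadLane, RegDefs, PhysRegUses)); // prints 1
  return 0;
}

Compiled as a plain C++ program, this prints 0 for the independent scc-defining add and 1 for the dependent readlane, matching the two MIR test cases added below.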

Change-Id: I076484b2bda27c2cf46013c845a0380c5b89b67b

Reviewers: arsenm, mareko, rampitec

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D42647

llvm-svn: 325882
Author: Nicolai Haehnle
Date:   2018-02-23 10:45:56 +00:00
commit 6cf306deca (parent 43c1115cd4)
4 changed files with 190 additions and 42 deletions


@@ -173,10 +173,17 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
   }
 }
 
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  for (const MachineOperand &Def : MI.operands()) {
-    if (Def.isReg() && Def.isDef())
-      Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+                              DenseSet<unsigned> &RegDefs,
+                              DenseSet<unsigned> &PhysRegUses) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg()) {
+      if (Op.isDef())
+        RegDefs.insert(Op.getReg());
+      else if (Op.readsReg() &&
+               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+        PhysRegUses.insert(Op.getReg());
+    }
   }
 }
 
@@ -195,16 +202,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
 // already in the list. Returns true in that case.
 static bool
 addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &Defs,
+                      DenseSet<unsigned> &RegDefs,
+                      DenseSet<unsigned> &PhysRegUses,
                       SmallVectorImpl<MachineInstr*> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
     // this instruction after the merged instructions.
-    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+    //
+    // Similarly, if there is a def which is read by an instruction that is to
+    // be moved for merging, then we need to move the def-instruction as well.
+    // This can only happen for physical registers such as M0; virtual
+    // registers are in SSA form.
+    if (Use.isReg() &&
+        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
-      addDefsToList(MI, Defs);
+      addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
     }
   }
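
A corresponding sketch for the M0 case handled by the new second condition above (again invented toy data in C++, not pass code): the load being sunk to the merge point reads m0, and an intervening S_MOV_B32 redefines m0; the mov must be pulled into InstsToMove so that the redefinition still ends up after the load's use of m0.

// Hedged, standalone illustration of the new physreg def-after-use branch;
// toy data only, the real check walks MachineOperands.
#include <cstdio>
#include <set>
#include <string>

int main() {
  // Recorded by addDefsUsesToList for the load being sunk to the merge
  // point: it defines %2 and reads the physical register m0.
  std::set<std::string> RegDefsToMove = {"%2"};
  std::set<std::string> PhysRegUsesToMove = {"m0"};

  // An intervening "$m0 = S_MOV_B32 ..." has a def of physreg m0. Sinking
  // the load past it unchanged would make the load read the new m0 value,
  // so the mov must join InstsToMove and be re-emitted after the load.
  struct { std::string Reg; bool IsDef, IsPhys; } MovOp = {"m0", true, true};

  bool MustMove =
      (!MovOp.IsDef && RegDefsToMove.count(MovOp.Reg)) ||
      (MovOp.IsDef && MovOp.IsPhys && PhysRegUsesToMove.count(MovOp.Reg));

  std::printf("S_MOV_B32 m0 joins InstsToMove: %s\n", MustMove ? "yes" : "no");
  return 0;
}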
@@ -228,16 +243,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
   return true;
 }
 
-static bool
-hasPhysRegDef(MachineInstr &MI) {
-  for (const MachineOperand &Def : MI.defs()) {
-    if (Def.isReg() &&
-        TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
-      return true;
-  }
-  return false;
-}
-
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
@@ -343,8 +348,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   ++MBBI;
 
-  DenseSet<unsigned> DefsToMove;
-  addDefsToList(*CI.I, DefsToMove);
+  DenseSet<unsigned> RegDefsToMove;
+  DenseSet<unsigned> PhysRegUsesToMove;
+  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
     if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -360,13 +366,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       return false;
     }
 
-    if (hasPhysRegDef(*MBBI)) {
-      // We could re-order this instruction in theory, but it would require
-      // tracking physreg defs and uses. This should only affect M0 in
-      // practice.
-      return false;
-    }
-
     if (MBBI->mayLoadOrStore() &&
         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
@@ -374,14 +373,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // #2. Add this instruction to the move list and then we will check
       // if condition #2 holds once we have selected the matching instruction.
       CI.InstsToMove.push_back(&*MBBI);
-      addDefsToList(*MBBI, DefsToMove);
+      addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
       continue;
     }
 
     // When we match I with another DS instruction we will be moving I down
     // to the location of the matched instruction any uses of I will need to
     // be moved down as well.
-    addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+    addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                          CI.InstsToMove);
     continue;
   }
@@ -395,7 +395,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     //   DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                              CI.InstsToMove))
       continue;
 
     bool Match = true;
@@ -454,8 +455,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
-        hasPhysRegDef(*MBBI))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
       break;
   }
 
   return false;


@@ -629,6 +629,27 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
   ret void
 }
 
+; GCN-LABEL: {{^}}ds_read_interp_read:
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; CI: s_mov_b32 m0, s0
+; CI: v_interp_mov_f32
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; GFX9: ds_read2_b32 v[0:1], v0 offset1:4
+; GFX9: s_mov_b32 m0, s0
+; GFX9: v_interp_mov_f32
+define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
+  %v0 = load float, float addrspace(3)* %inptr, align 4
+  %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
+  %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
+  %v1 = load float, float addrspace(3)* %ptr1, align 4
+  %v1b = fadd float %v1, %intrp
+  %r0 = insertelement <2 x float> undef, float %v0, i32 0
+  %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
+  ret <2 x float> %r1
+}
+
 declare void @void_func_void() #3
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 
@@ -636,6 +657,8 @@ declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workitem.id.y() #1
 
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
+
 declare void @llvm.amdgcn.s.barrier() #2
 
 attributes #0 = { nounwind }


@@ -0,0 +1,116 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# Check that SILoadStoreOptimizer honors physregs defs/uses between moved
+# instructions.
+#
+# The following IR snippet would usually be optimized by the peephole optimizer.
+# However, an equivalent situation can occur with buffer instructions as well.
+
+# CHECK-LABEL: name: scc_def_and_use_no_dependency
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+# CHECK: DS_READ2_B32
+--- |
+  define amdgpu_kernel void @scc_def_and_use_no_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+    ret void
+  }
+
+  define amdgpu_kernel void @scc_def_and_use_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+    ret void
+  }
+...
+---
+name: scc_def_and_use_no_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$sgpr0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    %1:vgpr_32 = COPY $vgpr0
+    %10:sgpr_32 = COPY $sgpr0
+    $m0 = S_MOV_B32 -1
+    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+    %11:sgpr_32 = S_ADD_U32 %10, 4, implicit-def $scc
+    %12:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+    %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+    S_ENDPGM
+...
+
+# CHECK-LABEL: name: scc_def_and_use_dependency
+# CHECK: DS_READ2_B32
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+---
+name: scc_def_and_use_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$sgpr0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    %1:vgpr_32 = COPY $vgpr0
+    %10:sgpr_32 = COPY $sgpr0
+    $m0 = S_MOV_B32 -1
+    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+    %20:sgpr_32 = V_READFIRSTLANE_B32 %2, implicit $exec
+    %21:sgpr_32 = S_ADD_U32 %20, 4, implicit-def $scc
+    ; The S_ADDC_U32 depends on the first DS_READ_B32 only via SCC
+    %11:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+    %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+    S_ENDPGM
+...


@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 
 ; SMRD load with an immediate offset.
@@ -232,15 +232,24 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; In principle we could merge the loads here as well, but it would require
-; careful tracking of physical registers since both v_interp* and v_movrel*
-; instructions (or gpr idx mode) use M0.
+; SICIVI: s_buffer_load_dwordx2
+; SICIVI: s_mov_b32 m0
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI: s_mov_b32 m0
+; SICIVI: v_movrels_b32_e32
 ;
-; GCN: s_buffer_load_dword
-; GCN: s_buffer_load_dword
-define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+; Merging is still thwarted on GFX9 due to s_set_gpr_idx
+;
+; GFX9: s_buffer_load_dword
+; GFX9: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
   %idx1 = bitcast float %idx1.f to i32