[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel.

- A COPY from SGPR to VGPR needs to be lowered into a real instruction. The
  plain COPY is meant for the case where source and destination live in the
  same register bank, so that the two registers can potentially be coalesced
  and one COPY saved. For that reason, backend optimizations such as CSE
  won't touch COPYs. An SGPR-to-VGPR copy, however, always has to be
  materialized as a native instruction, so it should be lowered into a real
  one before the other backend optimizations run, as sketched below.

Differential Revision: https://reviews.llvm.org/D87556
Michael Liao 2020-09-09 16:48:03 -04:00
parent 34b27b9441
commit c3492a1aa1
7 changed files with 103 additions and 9 deletions
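
For illustration, a minimal MIR-level sketch of the rewrite (the register
numbers and classes here are made up for the example, not taken from the
patch). Before finalizeLowering, two uses of the same SGPR value leave two
identical COPYs that later passes do not touch:

    %10:vgpr_32 = COPY %4:sgpr_32
    %11:vgpr_32 = COPY %4:sgpr_32

After the new lowering, each COPY becomes a real move, so a later pass such as
MachineCSE can keep a single v_mov and reuse its result:

    %10:vgpr_32 = V_MOV_B32_e32 %4:sgpr_32
    %11:vgpr_32 = V_MOV_B32_e32 %4:sgpr_32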


@@ -1244,6 +1244,11 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
        foldOperand(OpToFold, UseMI, OpNo, FoldList,
                    CopiesToReplace);
      } else {
        // Skip updating the literal use if it is used in the same REG_SEQUENCE
        // since, if that literal could be inlined, it is just a single use.
        if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
            UseMI->isRegSequence())
          continue;

        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;


@@ -102,6 +102,10 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
cl::desc("Use indirect register addressing for divergent indexes"),
cl::init(false));
static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
"lower-sgpr-to-vgpr-copy", cl::Hidden,
cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true));
static bool hasFP32Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
return Info->getMode().allFP32Denormals();
@@ -11485,6 +11489,59 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
  return false;
}

// Lower a COPY from SGPR to VGPR into a real instruction, as such a copy is a
// real data transfer rather than a plain same-bank COPY.
static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
                                const SIRegisterInfo &TRI,
                                const SIInstrInfo &TII) {
  for (MachineBasicBlock &MBB : MF) {
    for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
      MachineInstr &MI = *BI++;

      auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
        if (!MI.isCopy())
          return false;
        auto DstReg = MI.getOperand(0).getReg();
        auto SrcReg = MI.getOperand(1).getReg();
        const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
                                               : TRI.getPhysRegClass(DstReg);
        const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
                                               : TRI.getPhysRegClass(SrcReg);
        return (DstRC == &AMDGPU::VGPR_32RegClass ||
                DstRC == &AMDGPU::VReg_64RegClass) &&
               (SrcRC == &AMDGPU::SGPR_32RegClass ||
                SrcRC == &AMDGPU::SGPR_64RegClass);
      };

      // Skip if it's not a copy from SGPR to VGPR.
      if (!IsSGPRToVGPRCopy(MI))
        continue;

      const MachineOperand &Src = MI.getOperand(1);
      // FIXME: Need subreg support.
      if (Src.getSubReg() != AMDGPU::NoSubRegister)
        continue;
      // FIXME: Need undef support.
      if (Src.getReg().isVirtual()) {
        auto *DefMI = MRI.getVRegDef(Src.getReg());
        if (!DefMI || DefMI->isImplicitDef())
          continue;
      }

      LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
      unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
                            ? AMDGPU::V_MOV_B64_PSEUDO
                            : AMDGPU::V_MOV_B32_e32;
      auto DstReg = MI.getOperand(0).getReg();
      auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
                     .add(MI.getOperand(1));
      LLVM_DEBUG(dbgs() << " to: " << *MIB.getInstr());
      MI.eraseFromParent();
    }
  }
}
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -11493,6 +11550,10 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (EnableLowerSGPRToVGPRCopy)
    lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
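
(Note: EnableLowerSGPRToVGPRCopy above is a plain cl::opt, so the lowering can
presumably be disabled for comparison by passing -lower-sgpr-to-vgpr-copy=0 to
llc; that invocation is an assumption about how the hidden flag is exposed, not
something spelled out in the patch.)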


@@ -11,7 +11,7 @@
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
@@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
@@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
; FUNC-LABEL: {{^}}s_fabs_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)


@@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
; R600: |PV.{{[XYZW]}}|
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; VI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in)
; R600: |PV.{{[XYZW]}}|
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
@@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %
}
; FUNC-LABEL: {{^}}fneg_fabs_f32:
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
%fsub = fsub float -0.000000e+00, %fabs


@@ -0,0 +1,26 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
target triple = "amdgcn-amd-amdhsa"
; CHECK-LABEL: {{^}}t0:
; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
entry:
  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i = add i32 %0, %i0
  %j = add i32 %0, %j0
  %k = add i32 %0, %k0
  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
  %vi = load float, float addrspace(1)* %pi
  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
  %vj = load float, float addrspace(1)* %pj
  %sum = fadd float %vi, %vj
  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
  store float %sum, float addrspace(1)* %pk
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()


@@ -153,7 +153,9 @@ bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {


@@ -650,12 +650,12 @@ main_body:
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break