[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel.

- A COPY from SGPR to VGPR needs to be lowered into a real instruction. The
  plain COPY is meant for the case where source and destination live in the
  same register bank, so that the two registers can potentially be coalesced
  and one COPY saved. For that reason, backend optimizations such as CSE
  won't touch COPYs. An SGPR-to-VGPR copy, however, always has to be
  materialized as a native instruction, so it should be lowered into a real
  one before the other backend optimizations run, as sketched below.

Differential Revision: https://reviews.llvm.org/D87556
Michael Liao 2020-09-09 16:48:03 -04:00
parent 34b27b9441
commit c3492a1aa1
7 changed files with 103 additions and 9 deletions
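
For illustration, a minimal MIR-level sketch of the rewrite (the register
numbers and classes here are made up for the example, not taken from the
patch). Before finalizeLowering, two uses of the same SGPR value leave two
identical COPYs that later passes do not touch:

    %10:vgpr_32 = COPY %4:sgpr_32
    %11:vgpr_32 = COPY %4:sgpr_32

After the new lowering, each COPY becomes a real move, so a later pass such as
MachineCSE can keep a single v_mov and reuse its result:

    %10:vgpr_32 = V_MOV_B32_e32 %4:sgpr_32
    %11:vgpr_32 = V_MOV_B32_e32 %4:sgpr_32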


@@ -1244,6 +1244,11 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
        foldOperand(OpToFold, UseMI, OpNo, FoldList,
                    CopiesToReplace);
      } else {
        // Skip updating the literal use if it is used in the same REG_SEQUENCE
        // since, if that literal could be inlined, it is just a single use.
        if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
            UseMI->isRegSequence())
          continue;

        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;


@@ -102,6 +102,10 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
cl::desc("Use indirect register addressing for divergent indexes"),
cl::init(false));
static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
"lower-sgpr-to-vgpr-copy", cl::Hidden,
cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true));
static bool hasFP32Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
return Info->getMode().allFP32Denormals();
@@ -11485,6 +11489,59 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
  return false;
}

// Lower a COPY from SGPR to VGPR into a real instruction, as such a copy is a
// real data transfer rather than a plain same-bank COPY.
static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
                                const SIRegisterInfo &TRI,
                                const SIInstrInfo &TII) {
  for (MachineBasicBlock &MBB : MF) {
    for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
      MachineInstr &MI = *BI++;

      auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
        if (!MI.isCopy())
          return false;
        auto DstReg = MI.getOperand(0).getReg();
        auto SrcReg = MI.getOperand(1).getReg();
        const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
                                               : TRI.getPhysRegClass(DstReg);
        const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
                                               : TRI.getPhysRegClass(SrcReg);
        return (DstRC == &AMDGPU::VGPR_32RegClass ||
                DstRC == &AMDGPU::VReg_64RegClass) &&
               (SrcRC == &AMDGPU::SGPR_32RegClass ||
                SrcRC == &AMDGPU::SGPR_64RegClass);
      };

      // Skip if it's not a copy from SGPR to VGPR.
      if (!IsSGPRToVGPRCopy(MI))
        continue;

      const MachineOperand &Src = MI.getOperand(1);
      // FIXME: Need subreg support.
      if (Src.getSubReg() != AMDGPU::NoSubRegister)
        continue;
      // FIXME: Need undef support.
      if (Src.getReg().isVirtual()) {
        auto *DefMI = MRI.getVRegDef(Src.getReg());
        if (!DefMI || DefMI->isImplicitDef())
          continue;
      }

      LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
      unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
                            ? AMDGPU::V_MOV_B64_PSEUDO
                            : AMDGPU::V_MOV_B32_e32;
      auto DstReg = MI.getOperand(0).getReg();
      auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
                     .add(MI.getOperand(1));
      LLVM_DEBUG(dbgs() << " to: " << *MIB.getInstr());
      MI.eraseFromParent();
    }
  }
}
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -11493,6 +11550,10 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (EnableLowerSGPRToVGPRCopy)
    lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
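
(Note: EnableLowerSGPRToVGPRCopy above is a plain cl::opt, so the lowering can
presumably be disabled for comparison by passing -lower-sgpr-to-vgpr-copy=0 to
llc; that invocation is an assumption about how the hidden flag is exposed, not
something spelled out in the patch.)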


@@ -11,7 +11,7 @@
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
@@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
@@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
; FUNC-LABEL: {{^}}s_fabs_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)


@@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
; R600: |PV.{{[XYZW]}}|
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; VI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in)
; R600: |PV.{{[XYZW]}}|
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
@@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %
}
; FUNC-LABEL: {{^}}fneg_fabs_f32:
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
%fsub = fsub float -0.000000e+00, %fabs


@@ -0,0 +1,26 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
target triple = "amdgcn-amd-amdhsa"
; CHECK-LABEL: {{^}}t0:
; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
entry:
  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i = add i32 %0, %i0
  %j = add i32 %0, %j0
  %k = add i32 %0, %k0
  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
  %vi = load float, float addrspace(1)* %pi
  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
  %vj = load float, float addrspace(1)* %pj
  %sum = fadd float %vi, %vj
  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
  store float %sum, float addrspace(1)* %pk
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()


@@ -153,7 +153,9 @@ bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {


@@ -650,12 +650,12 @@ main_body:
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break