AMDGPU: Track physreg uses in SILoadStoreOptimizer
Summary:
This handles def-after-use of physregs, and allows us to merge loads and
stores even across some physreg defs (typically M0 defs).

Change-Id: I076484b2bda27c2cf46013c845a0380c5b89b67b

Reviewers: arsenm, mareko, rampitec

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D42647

llvm-svn: 325882
parent 43c1115cd4
commit 6cf306deca
@@ -173,10 +173,17 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
   }
 }
 
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  for (const MachineOperand &Def : MI.operands()) {
-    if (Def.isReg() && Def.isDef())
-      Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+                              DenseSet<unsigned> &RegDefs,
+                              DenseSet<unsigned> &PhysRegUses) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg()) {
+      if (Op.isDef())
+        RegDefs.insert(Op.getReg());
+      else if (Op.readsReg() &&
+               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+        PhysRegUses.insert(Op.getReg());
+    }
   }
 }
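The new helper addDefsUsesToList records, in addition to every register an instruction defines, the physical registers it reads: virtual registers are in SSA form and can never be redefined, so only physregs such as M0 or SCC need use-tracking. The stand-alone sketch below shows the same partition on a deliberately simplified operand model; the struct fields and register ids are invented for illustration and are not the real llvm::MachineOperand API.

    #include <cstdio>
    #include <set>

    // Simplified stand-in for a machine-instruction operand (invented fields,
    // not the real llvm::MachineOperand interface).
    struct Operand {
      unsigned Reg;
      bool IsDef;      // operand is written by the instruction
      bool ReadsReg;   // operand is read by the instruction
      bool IsPhysical; // physical register (e.g. $m0, $scc) vs. SSA vreg
    };

    // Mirrors addDefsUsesToList: every def is recorded, but uses matter only
    // for physical registers, since a virtual register has no later def.
    static void addDefsUsesToList(const Operand *Ops, int N,
                                  std::set<unsigned> &RegDefs,
                                  std::set<unsigned> &PhysRegUses) {
      for (int I = 0; I < N; ++I) {
        if (Ops[I].IsDef)
          RegDefs.insert(Ops[I].Reg);
        else if (Ops[I].ReadsReg && Ops[I].IsPhysical)
          PhysRegUses.insert(Ops[I].Reg);
      }
    }

    int main() {
      enum { M0 = 1, V2 = 100 }; // invented register ids for the example
      // Operands of "%2 = DS_READ_B32 ..., implicit $m0": defines %2, reads $m0.
      Operand DsRead[] = {{V2, true, false, false}, {M0, false, true, true}};
      std::set<unsigned> RegDefs, PhysRegUses;
      addDefsUsesToList(DsRead, 2, RegDefs, PhysRegUses);
      std::printf("RegDefs has %%2: %d, PhysRegUses has $m0: %d\n",
                  (int)RegDefs.count(V2), (int)PhysRegUses.count(M0));
      return 0;
    }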
@@ -195,16 +202,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
 // already in the list. Returns true in that case.
 static bool
 addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &Defs,
+                      DenseSet<unsigned> &RegDefs,
+                      DenseSet<unsigned> &PhysRegUses,
                       SmallVectorImpl<MachineInstr*> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
     // this instruction after the merged instructions.
-
-    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+    //
+    // Similarly, if there is a def which is read by an instruction that is to
+    // be moved for merging, then we need to move the def-instruction as well.
+    // This can only happen for physical registers such as M0; virtual
+    // registers are in SSA form.
+    if (Use.isReg() &&
+        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
-      addDefsToList(MI, Defs);
+      addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
     }
   }
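The added clause is what lets the GFX9 checks in the ds_read_interp_read test below expect a merged ds_read2_b32: the s_mov_b32 m0, s0 between the two loads defines a physreg that the queued first load reads, so it is recognized as dependent and moved below the merge point instead of blocking the merge. A stand-alone sketch of that decision, reusing the same invented operand model as above (again, not the real LLVM API):

    #include <cstdio>
    #include <set>

    // Same simplified operand model as in the previous sketch.
    struct Operand {
      unsigned Reg;
      bool IsDef;
      bool ReadsReg;
      bool IsPhysical;
    };

    // Mirrors the new condition in addToListsIfDependent: an instruction must
    // move with the group if it reads a register the group defines, or if it
    // defines a physical register the group reads (the def-after-use case).
    static bool isDependent(const Operand *Ops, int N,
                            const std::set<unsigned> &RegDefs,
                            const std::set<unsigned> &PhysRegUses) {
      for (int I = 0; I < N; ++I) {
        const Operand &Op = Ops[I];
        if ((Op.ReadsReg && RegDefs.count(Op.Reg)) ||
            (Op.IsDef && Op.IsPhysical && PhysRegUses.count(Op.Reg)))
          return true;
      }
      return false;
    }

    int main() {
      enum { M0 = 1 }; // invented register id for the example
      // The first DS_READ (the merge candidate) reads $m0:
      std::set<unsigned> RegDefs;            // defs of queued instructions
      std::set<unsigned> PhysRegUses = {M0}; // physreg uses of queued insts
      // An "$m0 = S_MOV_B32 ..." between the two loads redefines $m0, so it
      // is flagged as dependent and moved below the merged load:
      Operand M0Def[] = {{M0, true, false, true}};
      std::printf("must move $m0 def too: %d\n",
                  (int)isDependent(M0Def, 1, RegDefs, PhysRegUses));
      return 0;
    }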
@@ -228,16 +243,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
   return true;
 }
 
-static bool
-hasPhysRegDef(MachineInstr &MI) {
-  for (const MachineOperand &Def : MI.defs()) {
-    if (Def.isReg() &&
-        TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
-      return true;
-  }
-  return false;
-}
-
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
@@ -343,8 +348,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 
   ++MBBI;
 
-  DenseSet<unsigned> DefsToMove;
-  addDefsToList(*CI.I, DefsToMove);
+  DenseSet<unsigned> RegDefsToMove;
+  DenseSet<unsigned> PhysRegUsesToMove;
+  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
     if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -360,13 +366,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       return false;
     }
 
-    if (hasPhysRegDef(*MBBI)) {
-      // We could re-order this instruction in theory, but it would require
-      // tracking physreg defs and uses. This should only affect M0 in
-      // practice.
-      return false;
-    }
-
     if (MBBI->mayLoadOrStore() &&
         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
@@ -374,14 +373,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // #2. Add this instruction to the move list and then we will check
       // if condition #2 holds once we have selected the matching instruction.
       CI.InstsToMove.push_back(&*MBBI);
-      addDefsToList(*MBBI, DefsToMove);
+      addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
       continue;
     }
 
     // When we match I with another DS instruction we will be moving I down
     // to the location of the matched instruction any uses of I will need to
     // be moved down as well.
-    addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+    addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                          CI.InstsToMove);
     continue;
   }
 
@@ -395,7 +395,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     // DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                              CI.InstsToMove))
       continue;
 
     bool Match = true;
@@ -454,8 +455,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
-        hasPhysRegDef(*MBBI))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
       break;
   }
   return false;

@@ -629,6 +629,27 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
   ret void
 }
 
+; GCN-LABEL: {{^}}ds_read_interp_read:
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; CI: s_mov_b32 m0, s0
+; CI: v_interp_mov_f32
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; GFX9: ds_read2_b32 v[0:1], v0 offset1:4
+; GFX9: s_mov_b32 m0, s0
+; GFX9: v_interp_mov_f32
+define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
+  %v0 = load float, float addrspace(3)* %inptr, align 4
+  %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
+  %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
+  %v1 = load float, float addrspace(3)* %ptr1, align 4
+  %v1b = fadd float %v1, %intrp
+  %r0 = insertelement <2 x float> undef, float %v0, i32 0
+  %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
+  ret <2 x float> %r1
+}
+
 declare void @void_func_void() #3
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
@@ -636,6 +657,8 @@ declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workitem.id.y() #1
 
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
+
 declare void @llvm.amdgcn.s.barrier() #2
 
 attributes #0 = { nounwind }

@@ -0,0 +1,116 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# Check that SILoadStoreOptimizer honors physregs defs/uses between moved
+# instructions.
+#
+# The following IR snippet would usually be optimized by the peephole optimizer.
+# However, an equivalent situation can occur with buffer instructions as well.
+
+# CHECK-LABEL: name: scc_def_and_use_no_dependency
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+# CHECK: DS_READ2_B32
+--- |
+  define amdgpu_kernel void @scc_def_and_use_no_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+    ret void
+  }
+
+  define amdgpu_kernel void @scc_def_and_use_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+    ret void
+  }
+...
+---
+name: scc_def_and_use_no_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$sgpr0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    %1:vgpr_32 = COPY $vgpr0
+    %10:sgpr_32 = COPY $sgpr0
+
+    $m0 = S_MOV_B32 -1
+    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+
+    %11:sgpr_32 = S_ADD_U32 %10, 4, implicit-def $scc
+    %12:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+
+    %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+    S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: scc_def_and_use_dependency
+# CHECK: DS_READ2_B32
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+---
+name: scc_def_and_use_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$sgpr0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    %1:vgpr_32 = COPY $vgpr0
+    %10:sgpr_32 = COPY $sgpr0
+
+    $m0 = S_MOV_B32 -1
+    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+    %20:sgpr_32 = V_READFIRSTLANE_B32 %2, implicit $exec
+
+    %21:sgpr_32 = S_ADD_U32 %20, 4, implicit-def $scc
+    ; The S_ADDC_U32 depends on the first DS_READ_B32 only via SCC
+    %11:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+
+    %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+    S_ENDPGM
+
+...

@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 
 ; SMRD load with an immediate offset.
@@ -232,15 +232,24 @@ main_body:
   ret void
 }
 
-; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; In principle we could merge the loads here as well, but it would require
-; careful tracking of physical registers since both v_interp* and v_movrel*
-; instructions (or gpr idx mode) use M0.
+; SICIVI: s_buffer_load_dwordx2
+; SICIVI: s_mov_b32 m0
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI: s_mov_b32 m0
+; SICIVI: v_movrels_b32_e32
 ;
-; GCN: s_buffer_load_dword
-; GCN: s_buffer_load_dword
-define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+; Merging is still thwarted on GFX9 due to s_set_gpr_idx
+;
+; GFX9: s_buffer_load_dword
+; GFX9: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
   %idx1 = bitcast float %idx1.f to i32