forked from OSchip/llvm-project
[AMDGPU] Remove unnecessary v_mov from a register to itself in WQM lowering.
Summary: - SI Whole Quad Mode phase is replacing WQM pseudo instructions with v_mov instructions. While this is necessary for the special handling of moving results out of WWM live ranges, it is not necessary for WQM live ranges. The result is a v_mov from a register to itself after every WQM operation. This change uses a COPY pseudo in these cases, which allows the register allocator to coalesce the moves away. Reviewers: tpr, dstuttard, foad, nhaehnle Reviewed By: nhaehnle Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71386
This commit is contained in:
parent
4134d706d9
commit
4a32cd11ac
|
@ -156,6 +156,7 @@ private:
|
|||
DenseMap<const MachineInstr *, InstrInfo> Instructions;
|
||||
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
|
||||
SmallVector<MachineInstr *, 1> LiveMaskQueries;
|
||||
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
|
||||
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
|
||||
|
||||
void printInfo();
|
||||
|
@ -352,7 +353,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
|||
// inactive lanes.
|
||||
markInstructionUses(MI, StateWWM, Worklist);
|
||||
GlobalFlags |= StateWWM;
|
||||
LowerToCopyInstrs.push_back(&MI);
|
||||
LowerToMovInstrs.push_back(&MI);
|
||||
continue;
|
||||
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
|
||||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
|
||||
|
@ -852,9 +853,8 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
|
|||
}
|
||||
|
||||
void SIWholeQuadMode::lowerCopyInstrs() {
|
||||
for (MachineInstr *MI : LowerToCopyInstrs) {
|
||||
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
|
||||
MI->RemoveOperand(i);
|
||||
for (MachineInstr *MI : LowerToMovInstrs) {
|
||||
assert(MI->getNumExplicitOperands() == 2);
|
||||
|
||||
const Register Reg = MI->getOperand(0).getReg();
|
||||
|
||||
|
@ -872,6 +872,22 @@ void SIWholeQuadMode::lowerCopyInstrs() {
|
|||
MI->setDesc(TII->get(AMDGPU::COPY));
|
||||
}
|
||||
}
|
||||
for (MachineInstr *MI : LowerToCopyInstrs) {
|
||||
if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
|
||||
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
|
||||
assert(MI->getNumExplicitOperands() == 3);
|
||||
// the only reason we should be here is V_SET_INACTIVE has
|
||||
// an undef input so it is being replaced by a simple copy.
|
||||
// There should be a second undef source that we should remove.
|
||||
assert(MI->getOperand(2).isUndef());
|
||||
MI->RemoveOperand(2);
|
||||
MI->untieRegOperand(1);
|
||||
} else {
|
||||
assert(MI->getNumExplicitOperands() == 2);
|
||||
}
|
||||
|
||||
MI->setDesc(TII->get(AMDGPU::COPY));
|
||||
}
|
||||
}
|
||||
|
||||
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
||||
|
@ -879,6 +895,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
|||
Blocks.clear();
|
||||
LiveMaskQueries.clear();
|
||||
LowerToCopyInstrs.clear();
|
||||
LowerToMovInstrs.clear();
|
||||
CallingConv = MF.getFunction().getCallingConv();
|
||||
|
||||
ST = &MF.getSubtarget<GCNSubtarget>();
|
||||
|
@ -893,7 +910,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
|||
unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
if (!(GlobalFlags & StateWQM)) {
|
||||
lowerLiveMaskQueries(Exec);
|
||||
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
|
||||
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
|
||||
return !LiveMaskQueries.empty();
|
||||
} else {
|
||||
// Store a copy of the original live mask when required
|
||||
|
|
|
@ -117,6 +117,9 @@ main_body:
|
|||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
|
||||
; does not happen - the v_add should write the return reg directly.
|
||||
;CHECK-NOT: v_mov_b32_e32
|
||||
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
|
|
|
@ -48,3 +48,62 @@ body: |
|
|||
SI_RETURN_TO_EPILOG $vgpr0
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
# V_SET_INACTIVE, when its second operand is undef, is replaced by a
|
||||
# COPY by si-wqm. Ensure the instruction is removed.
|
||||
#CHECK-NOT: V_SET_INACTIVE
|
||||
name: no_cfg
|
||||
alignment: 1
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
failedISel: false
|
||||
tracksRegLiveness: true
|
||||
hasWinCFI: false
|
||||
registers:
|
||||
- { id: 0, class: sgpr_32, preferred-register: '' }
|
||||
- { id: 1, class: sgpr_32, preferred-register: '' }
|
||||
- { id: 2, class: sgpr_32, preferred-register: '' }
|
||||
- { id: 3, class: sgpr_32, preferred-register: '' }
|
||||
- { id: 4, class: sgpr_32, preferred-register: '' }
|
||||
- { id: 5, class: sgpr_128, preferred-register: '' }
|
||||
- { id: 6, class: sgpr_128, preferred-register: '' }
|
||||
- { id: 7, class: sreg_32, preferred-register: '' }
|
||||
- { id: 8, class: vreg_64, preferred-register: '' }
|
||||
- { id: 9, class: sreg_32, preferred-register: '' }
|
||||
- { id: 10, class: vgpr_32, preferred-register: '' }
|
||||
- { id: 11, class: vgpr_32, preferred-register: '' }
|
||||
- { id: 12, class: sreg_32, preferred-register: '' }
|
||||
- { id: 13, class: vgpr_32, preferred-register: '' }
|
||||
- { id: 14, class: vgpr_32, preferred-register: '' }
|
||||
- { id: 15, class: vgpr_32, preferred-register: '' }
|
||||
- { id: 16, class: vgpr_32, preferred-register: '' }
|
||||
liveins:
|
||||
- { reg: '$sgpr0', virtual-reg: '%0' }
|
||||
- { reg: '$sgpr1', virtual-reg: '%1' }
|
||||
- { reg: '$sgpr2', virtual-reg: '%2' }
|
||||
- { reg: '$sgpr3', virtual-reg: '%3' }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
|
||||
|
||||
%3:sgpr_32 = COPY $sgpr3
|
||||
%2:sgpr_32 = COPY $sgpr2
|
||||
%1:sgpr_32 = COPY $sgpr1
|
||||
%0:sgpr_32 = COPY $sgpr0
|
||||
%6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
||||
%5:sgpr_128 = COPY %6
|
||||
%7:sreg_32 = S_MOV_B32 0
|
||||
%8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
%16:vgpr_32 = COPY %8.sub1
|
||||
%11:vgpr_32 = COPY %16
|
||||
%10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec
|
||||
%14:vgpr_32 = COPY %7
|
||||
%13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
|
||||
early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec
|
||||
BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue