forked from OSchip/llvm-project
AMDGPU: Fix liveness when expanding m0 loop
llvm-svn: 273514
This commit is contained in:
parent
5dae789a16
commit
3cb4ddeb4e
|
@ -2020,17 +2020,18 @@ def SI_RETURN : InstSI <
|
|||
let hasNoSchedulingInfo = 1;
|
||||
}
|
||||
|
||||
let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
|
||||
let Uses = [EXEC], Defs = [EXEC, VCC, M0],
|
||||
UseNamedOperandTable = 1 in {
|
||||
|
||||
class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
|
||||
(outs VGPR_32:$dst, SReg_64:$temp),
|
||||
(ins rc:$src, VSrc_32:$idx, i32imm:$off)
|
||||
(outs VGPR_32:$vdst, SReg_64:$sdst),
|
||||
(ins rc:$src, VSrc_32:$idx, i32imm:$offset)
|
||||
>;
|
||||
|
||||
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
|
||||
(outs rc:$dst, SReg_64:$temp),
|
||||
(ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> {
|
||||
let Constraints = "$src = $dst";
|
||||
(outs rc:$vdst, SReg_64:$sdst),
|
||||
(ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> {
|
||||
let Constraints = "$src = $vdst";
|
||||
}
|
||||
|
||||
// TODO: We can support indirect SGPR access.
|
||||
|
|
|
@ -52,6 +52,7 @@
|
|||
#include "AMDGPUSubtarget.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/LivePhysRegs.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
|
@ -88,9 +89,15 @@ private:
|
|||
void Kill(MachineInstr &MI);
|
||||
void Branch(MachineInstr &MI);
|
||||
|
||||
// Populates the live-in lists of LoopBB and RemainderBB, the two blocks
// produced when MBB is split at MI. SaveReg and IdxReg are the registers
// passed by loadM0 (the saved-EXEC register and the index register).
void splitBlockLiveIns(const MachineBasicBlock &MBB,
|
||||
const MachineInstr &MI,
|
||||
MachineBasicBlock &LoopBB,
|
||||
MachineBasicBlock &RemainderBB,
|
||||
unsigned SaveReg,
|
||||
unsigned IdxReg);
|
||||
|
||||
void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
|
||||
MachineInstr *MovRel,
|
||||
unsigned SaveReg, unsigned IdxReg, int Offset);
|
||||
MachineInstr *MovRel, unsigned IdxReg, int Offset);
|
||||
|
||||
bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
|
||||
void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
|
||||
|
@ -373,10 +380,41 @@ void SILowerControlFlow::Kill(MachineInstr &MI) {
|
|||
MI.eraseFromParent();
|
||||
}
|
||||
|
||||
// All currently live registers must remain so in the remainder block.
//
// Computes the live-in sets for the two blocks created when MBB is split at
// MI:
//   - RemainderBB receives every physical register found live by walking
//     backward from the end of MBB toward MI, plus SaveReg and (when present)
//     the register of MI's `val` operand.
//   - LoopBB receives the registers of MI's `src` operand, IdxReg, and (when
//     present) the `val` operand.
// Nothing is returned; the result is recorded through addLiveIn() on LoopBB
// and RemainderBB.
|
||||
void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
|
||||
const MachineInstr &MI,
|
||||
MachineBasicBlock &LoopBB,
|
||||
MachineBasicBlock &RemainderBB,
|
||||
unsigned SaveReg,
|
||||
unsigned IdxReg) {
|
||||
LivePhysRegs RemainderLiveRegs(TRI);
|
||||
|
||||
// Seed with MBB's live-outs, then step backward over the instructions from
// the end of MBB until the reverse iterator built from &MI is reached.
// NOTE(review): whether MI itself is stepped over depends on how the reverse
// iterator is constructed from an instruction pointer -- confirm.
RemainderLiveRegs.addLiveOuts(MBB);
|
||||
for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
|
||||
I != E; ++I) {
|
||||
RemainderLiveRegs.stepBackward(*I);
|
||||
}
|
||||
|
||||
// Add reg defined in loop body.
|
||||
RemainderLiveRegs.addReg(SaveReg);
|
||||
|
||||
// The `val` operand is optional here: when MI has one, its register is made
// live into both the loop block and the remainder block.
if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
|
||||
RemainderLiveRegs.addReg(Val->getReg());
|
||||
LoopBB.addLiveIn(Val->getReg());
|
||||
}
|
||||
|
||||
// Publish the accumulated set as RemainderBB's live-ins.
for (unsigned Reg : RemainderLiveRegs)
|
||||
RemainderBB.addLiveIn(Reg);
|
||||
|
||||
// Unlike `val`, the `src` operand is dereferenced without a null check, so
// MI must carry a `src` operand.
unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
|
||||
LoopBB.addLiveIn(SrcReg);
|
||||
LoopBB.addLiveIn(IdxReg);
|
||||
// Registers may have been added more than once above (e.g. if operands share
// a register); sort and de-duplicate LoopBB's live-in list.
LoopBB.sortUniqueLiveIns();
|
||||
}
|
||||
|
||||
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
|
||||
DebugLoc DL,
|
||||
MachineInstr *MovRel,
|
||||
unsigned SaveReg,
|
||||
unsigned IdxReg,
|
||||
int Offset) {
|
||||
MachineBasicBlock::iterator I = LoopBB.begin();
|
||||
|
@ -421,9 +459,9 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
|
|||
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
|
||||
MachineBasicBlock &MBB = *MI.getParent();
|
||||
DebugLoc DL = MI.getDebugLoc();
|
||||
MachineBasicBlock::iterator I = MI;
|
||||
MachineBasicBlock::iterator I(&MI);
|
||||
|
||||
unsigned Idx = MI.getOperand(3).getReg();
|
||||
unsigned Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx)->getReg();
|
||||
|
||||
if (AMDGPU::SReg_32RegClass.contains(Idx)) {
|
||||
if (Offset) {
|
||||
|
@ -441,14 +479,16 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
|
|||
}
|
||||
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
unsigned Save = MI.getOperand(1).getReg();
|
||||
MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
|
||||
SaveOp->setIsDead(false);
|
||||
unsigned Save = SaveOp->getReg();
|
||||
|
||||
// Reading from a VGPR requires looping over all workitems in the wavefront.
|
||||
assert(AMDGPU::SReg_64RegClass.contains(Save) &&
|
||||
AMDGPU::VGPR_32RegClass.contains(Idx));
|
||||
|
||||
// Save the EXEC mask
|
||||
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
|
||||
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
|
||||
.addReg(AMDGPU::EXEC);
|
||||
|
||||
// To insert the loop we need to split the block. Move everything after this
|
||||
|
@ -464,11 +504,14 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
|
|||
LoopBB->addSuccessor(LoopBB);
|
||||
LoopBB->addSuccessor(RemainderBB);
|
||||
|
||||
if (TRI->trackLivenessAfterRegAlloc(MF))
|
||||
splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, Idx);
|
||||
|
||||
// Move the rest of the block into a new block.
|
||||
RemainderBB->transferSuccessors(&MBB);
|
||||
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
|
||||
|
||||
emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset);
|
||||
emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Idx, Offset);
|
||||
|
||||
MachineBasicBlock::iterator First = RemainderBB->begin();
|
||||
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
|
||||
|
@ -511,16 +554,16 @@ bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
|
|||
DebugLoc DL = MI.getDebugLoc();
|
||||
|
||||
unsigned Dst = MI.getOperand(0).getReg();
|
||||
unsigned Vec = MI.getOperand(2).getReg();
|
||||
int Off = MI.getOperand(4).getImm();
|
||||
unsigned Vec = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
|
||||
int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
|
||||
unsigned Reg;
|
||||
|
||||
computeIndirectRegAndOffset(Vec, Reg, Off);
|
||||
|
||||
MachineInstr *MovRel =
|
||||
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
|
||||
.addReg(Reg)
|
||||
.addReg(Vec, RegState::Implicit);
|
||||
.addReg(Reg)
|
||||
.addReg(Vec, RegState::Implicit);
|
||||
|
||||
return loadM0(MI, MovRel, Off);
|
||||
}
|
||||
|
@ -531,17 +574,17 @@ bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
|
|||
DebugLoc DL = MI.getDebugLoc();
|
||||
|
||||
unsigned Dst = MI.getOperand(0).getReg();
|
||||
int Off = MI.getOperand(4).getImm();
|
||||
unsigned Val = MI.getOperand(5).getReg();
|
||||
int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
|
||||
unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)->getReg();
|
||||
unsigned Reg;
|
||||
|
||||
computeIndirectRegAndOffset(Dst, Reg, Off);
|
||||
|
||||
MachineInstr *MovRel =
|
||||
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
|
||||
.addReg(Reg, RegState::Define)
|
||||
.addReg(Val)
|
||||
.addReg(Dst, RegState::Implicit);
|
||||
.addReg(Reg, RegState::Define)
|
||||
.addReg(Val)
|
||||
.addReg(Dst, RegState::Implicit);
|
||||
|
||||
return loadM0(MI, MovRel, Off);
|
||||
}
|
||||
|
|
|
@ -220,9 +220,18 @@ entry:
|
|||
%idx0 = load volatile i32, i32 addrspace(1)* %gep
|
||||
%idx1 = add i32 %idx0, 1
|
||||
%val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
|
||||
%live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={SGPR4}" ()
|
||||
%val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
|
||||
store volatile i32 %val0, i32 addrspace(1)* %out0
|
||||
store volatile i32 %val1, i32 addrspace(1)* %out0
|
||||
%cmp = icmp eq i32 %id, 0
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
|
||||
bb1:
|
||||
store volatile i32 %live.out.reg, i32 addrspace(1)* undef
|
||||
br label %bb2
|
||||
|
||||
bb2:
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -230,7 +239,7 @@ entry:
|
|||
; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
|
||||
; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
|
||||
; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
|
||||
; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62
|
||||
; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
|
||||
; CHECK-DAG: s_waitcnt vmcnt(0)
|
||||
|
||||
; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
|
||||
|
@ -259,6 +268,8 @@ entry:
|
|||
; CHECK: s_cbranch_execnz [[LOOP1]]
|
||||
|
||||
; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
|
||||
|
||||
; CHECK: buffer_store_dword [[INS0]]
|
||||
define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
|
||||
entry:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
@ -266,9 +277,18 @@ entry:
|
|||
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
|
||||
%idx0 = load volatile i32, i32 addrspace(1)* %gep
|
||||
%idx1 = add i32 %idx0, 1
|
||||
%vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0
|
||||
%live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
|
||||
%vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
|
||||
%vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
|
||||
store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
|
||||
%cmp = icmp eq i32 %id, 0
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
|
||||
bb1:
|
||||
store volatile i32 %live.out.val, i32 addrspace(1)* undef
|
||||
br label %bb2
|
||||
|
||||
bb2:
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue