AMDGPU/SI: Turn off GPR Indexing Mode immediately after the instruction of interest.

Summary:
  In the current implementation of GPR Indexing Mode, when the index is non-uniform, the s_set_gpr_idx_off instruction
is incorrectly inserted after the loop. This causes instructions with VGPR operands (v_readfirstlane, for example) to read an
incorrect VGPR.
 In this patch, we fix the issue by inserting s_set_gpr_idx_on/off immediately around the instruction of interest.

Reviewers:
  rampitec

Differential Revision:
  https://reviews.llvm.org/D43297

llvm-svn: 325355
This commit is contained in:
Changpeng Fang 2018-02-16 16:31:30 +00:00
parent ff53a4a234
commit da38b5fd49
2 changed files with 46 additions and 77 deletions

View File

@ -2744,7 +2744,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode) {
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@ -2773,6 +2774,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
@ -2783,11 +2790,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}
MachineInstr *SetIdx =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
.addReg(IdxReg, RegState::Kill);
SetIdx->getOperand(2).setIsUndef();
unsigned IdxMode = IsIndirectSrc ?
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
MachineInstr *SetOn =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(IdxReg, RegState::Kill)
.addImm(IdxMode);
SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@ -2800,12 +2809,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}
// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@ -2833,7 +2836,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
bool UseGPRIdxMode) {
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@ -2872,7 +2876,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
Offset, UseGPRIdxMode);
Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@ -3007,17 +3011,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
if (UseGPRIdxMode) {
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addImm(0) // Reset inside loop.
.addImm(VGPRIndexMode::SRC0_ENABLE);
SetOn->getOperand(3).setIsUndef();
// Disable again after the loop.
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
}
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@ -3025,6 +3020,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
@ -3125,22 +3121,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
if (UseGPRIdxMode) {
MachineBasicBlock::iterator I(&MI);
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addImm(0) // Reset inside loop.
.addImm(VGPRIndexMode::DST_ENABLE);
SetOn->getOperand(3).setIsUndef();
// Disable again after the loop.
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
}
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Offset, UseGPRIdxMode);
Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@ -3150,6 +3134,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

View File

@ -118,23 +118,21 @@ entry:
; The offset depends on the register that holds the first element of the vector.
; FIXME: The waitcnt for the argument load can go after the loop
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
; IDXMODE: s_set_gpr_idx_off
; GCN: s_cbranch_execnz
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
@ -250,21 +248,19 @@ entry:
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
; IDXMODE: s_set_gpr_idx_off
; GCN: s_cbranch_execnz [[LOOPBB]]
; GCN: s_mov_b64 exec, [[SAVEEXEC]]
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
@ -283,8 +279,6 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; The offset depends on the register that holds the first element of the vector.
@ -294,12 +288,11 @@ entry:
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
; IDXMODE: s_set_gpr_idx_off
; GCN: s_cbranch_execnz
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
@ -322,52 +315,46 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
; IDXMODE: s_set_gpr_idx_off
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK]]
; IDXMODE: s_set_gpr_idx_off
; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
; IDXMODE: s_set_gpr_idx_off
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[MOVREL0]]
; GCN: buffer_store_dword [[MOVREL1]]
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
@ -403,42 +390,38 @@ bb2:
; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
; IDXMODE: s_set_gpr_idx_off
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP0]]
; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
; IDXMODE: s_set_gpr_idx_off
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[MASK]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
; IDXMODE: s_set_gpr_idx_off
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]
@ -639,7 +622,6 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)
; GCN: {{^BB[0-9]+_[0-9]+}}:
; GCN: s_mov_b64 exec,
; IDXMODE: s_set_gpr_idx_off
; GCN: [[BB2]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
@ -648,8 +630,10 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)
; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
; MOVREL: v_movreld_b32_e32
; IDXMODE: s_set_gpr_idx_idx
; IDXMODE: s_set_gpr_idx_on
; IDXMODE: v_mov_b32_e32
; IDXMODE: s_set_gpr_idx_off
; GCN: s_cbranch_execnz [[REGLOOP]]
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb: