forked from OSchip/llvm-project
AMDGPU/SI: Turn off GPR Indexing Mode immediately after the interested instruction.
Summary: In the current implementation of GPR Indexing Mode when the index is of non-uniform, the s_set_gpr_idx_off instruction is incorrectly inserted after the loop. This will lead the instructions with vgpr operands (v_readfirstlane for example) to read incorrect vgpr. In this patch, we fix the issue by inserting s_set_gpr_idx_on/off immediately around the interested instruction. Reviewers: rampitec Differential Revision: https://reviews.llvm.org/D43297 llvm-svn: 325355
This commit is contained in:
parent
ff53a4a234
commit
da38b5fd49
|
@ -2744,7 +2744,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|||
unsigned PhiReg,
|
||||
unsigned InitSaveExecReg,
|
||||
int Offset,
|
||||
bool UseGPRIdxMode) {
|
||||
bool UseGPRIdxMode,
|
||||
bool IsIndirectSrc) {
|
||||
MachineBasicBlock::iterator I = LoopBB.begin();
|
||||
|
||||
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
||||
|
@ -2773,6 +2774,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|||
.addReg(CurrentIdxReg)
|
||||
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
|
||||
|
||||
// Update EXEC, save the original EXEC value to VCC.
|
||||
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
|
||||
.addReg(CondReg, RegState::Kill);
|
||||
|
||||
MRI.setSimpleHint(NewExec, CondReg);
|
||||
|
||||
if (UseGPRIdxMode) {
|
||||
unsigned IdxReg;
|
||||
if (Offset == 0) {
|
||||
|
@ -2783,11 +2790,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|||
.addReg(CurrentIdxReg, RegState::Kill)
|
||||
.addImm(Offset);
|
||||
}
|
||||
|
||||
MachineInstr *SetIdx =
|
||||
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
|
||||
.addReg(IdxReg, RegState::Kill);
|
||||
SetIdx->getOperand(2).setIsUndef();
|
||||
unsigned IdxMode = IsIndirectSrc ?
|
||||
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
|
||||
MachineInstr *SetOn =
|
||||
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
||||
.addReg(IdxReg, RegState::Kill)
|
||||
.addImm(IdxMode);
|
||||
SetOn->getOperand(3).setIsUndef();
|
||||
} else {
|
||||
// Move index from VCC into M0
|
||||
if (Offset == 0) {
|
||||
|
@ -2800,12 +2809,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|||
}
|
||||
}
|
||||
|
||||
// Update EXEC, save the original EXEC value to VCC.
|
||||
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
|
||||
.addReg(CondReg, RegState::Kill);
|
||||
|
||||
MRI.setSimpleHint(NewExec, CondReg);
|
||||
|
||||
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
|
||||
MachineInstr *InsertPt =
|
||||
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
|
||||
|
@ -2833,7 +2836,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
|
|||
unsigned InitResultReg,
|
||||
unsigned PhiReg,
|
||||
int Offset,
|
||||
bool UseGPRIdxMode) {
|
||||
bool UseGPRIdxMode,
|
||||
bool IsIndirectSrc) {
|
||||
MachineFunction *MF = MBB.getParent();
|
||||
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
|
@ -2872,7 +2876,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
|
|||
|
||||
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
|
||||
InitResultReg, DstReg, PhiReg, TmpExec,
|
||||
Offset, UseGPRIdxMode);
|
||||
Offset, UseGPRIdxMode, IsIndirectSrc);
|
||||
|
||||
MachineBasicBlock::iterator First = RemainderBB->begin();
|
||||
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
|
||||
|
@ -3007,17 +3011,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
|
|||
|
||||
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
|
||||
|
||||
if (UseGPRIdxMode) {
|
||||
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
||||
.addImm(0) // Reset inside loop.
|
||||
.addImm(VGPRIndexMode::SRC0_ENABLE);
|
||||
SetOn->getOperand(3).setIsUndef();
|
||||
|
||||
// Disable again after the loop.
|
||||
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
||||
}
|
||||
|
||||
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
|
||||
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
|
||||
Offset, UseGPRIdxMode, true);
|
||||
MachineBasicBlock *LoopBB = InsPt->getParent();
|
||||
|
||||
if (UseGPRIdxMode) {
|
||||
|
@ -3025,6 +3020,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
|
|||
.addReg(SrcReg, RegState::Undef, SubReg)
|
||||
.addReg(SrcReg, RegState::Implicit)
|
||||
.addReg(AMDGPU::M0, RegState::Implicit);
|
||||
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
||||
} else {
|
||||
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
|
||||
.addReg(SrcReg, RegState::Undef, SubReg)
|
||||
|
@ -3125,22 +3121,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
|
|||
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
|
||||
if (UseGPRIdxMode) {
|
||||
MachineBasicBlock::iterator I(&MI);
|
||||
|
||||
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
||||
.addImm(0) // Reset inside loop.
|
||||
.addImm(VGPRIndexMode::DST_ENABLE);
|
||||
SetOn->getOperand(3).setIsUndef();
|
||||
|
||||
// Disable again after the loop.
|
||||
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
||||
}
|
||||
|
||||
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
|
||||
|
||||
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
|
||||
Offset, UseGPRIdxMode);
|
||||
Offset, UseGPRIdxMode, false);
|
||||
MachineBasicBlock *LoopBB = InsPt->getParent();
|
||||
|
||||
if (UseGPRIdxMode) {
|
||||
|
@ -3150,6 +3134,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
|
|||
.addReg(Dst, RegState::ImplicitDefine)
|
||||
.addReg(PhiReg, RegState::Implicit)
|
||||
.addReg(AMDGPU::M0, RegState::Implicit);
|
||||
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
||||
} else {
|
||||
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
|
||||
|
||||
|
|
|
@ -118,23 +118,21 @@ entry:
|
|||
; The offset depends on the register that holds the first element of the vector.
|
||||
|
||||
; FIXME: The waitcnt for the argument load can go after the loop
|
||||
; IDXMODE: s_set_gpr_idx_on 0, src0
|
||||
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
|
||||
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
|
||||
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
|
||||
|
||||
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
|
||||
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
|
||||
; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: s_cbranch_execnz
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
|
@ -250,21 +248,19 @@ entry:
|
|||
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
|
||||
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
|
||||
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
|
||||
|
||||
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
|
||||
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
|
||||
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: s_cbranch_execnz [[LOOPBB]]
|
||||
; GCN: s_mov_b64 exec, [[SAVEEXEC]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: buffer_store_dword
|
||||
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
|
||||
entry:
|
||||
|
@ -283,8 +279,6 @@ entry:
|
|||
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on 0, dst
|
||||
|
||||
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
|
||||
|
||||
; The offset depends on the register that holds the first element of the vector.
|
||||
|
@ -294,12 +288,11 @@ entry:
|
|||
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
|
||||
|
||||
; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
|
||||
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
|
||||
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
|
||||
; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: s_cbranch_execnz
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
|
||||
entry:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
@ -322,52 +315,46 @@ entry:
|
|||
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on 0, src0
|
||||
|
||||
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
|
||||
|
||||
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[READLANE]]
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
|
||||
; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
|
||||
|
||||
; FIXME: Redundant copy
|
||||
; GCN: s_mov_b64 exec, [[MASK]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on 0, src0
|
||||
; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
|
||||
|
||||
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
|
||||
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[READLANE]]
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
|
||||
; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, vcc
|
||||
; GCN: s_cbranch_execnz [[LOOP1]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: buffer_store_dword [[MOVREL0]]
|
||||
; GCN: buffer_store_dword [[MOVREL1]]
|
||||
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
|
||||
|
@ -403,42 +390,38 @@ bb2:
|
|||
; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
|
||||
; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on 0, dst
|
||||
|
||||
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[READLANE]]
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
|
||||
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, vcc
|
||||
; GCN: s_cbranch_execnz [[LOOP0]]
|
||||
|
||||
; FIXME: Redundant copy
|
||||
; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on 0, dst
|
||||
; GCN: s_mov_b64 [[MASK]], exec
|
||||
|
||||
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
|
||||
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
|
||||
; GCN: s_and_saveexec_b64 vcc, vcc
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[READLANE]]
|
||||
; MOVREL: s_and_saveexec_b64 vcc, vcc
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
|
||||
; IDXMODE: s_and_saveexec_b64 vcc, vcc
|
||||
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
|
||||
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, vcc
|
||||
; GCN: s_cbranch_execnz [[LOOP1]]
|
||||
|
@ -639,7 +622,6 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)
|
|||
|
||||
; GCN: {{^BB[0-9]+_[0-9]+}}:
|
||||
; GCN: s_mov_b64 exec,
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: [[BB2]]:
|
||||
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
|
||||
|
@ -648,8 +630,10 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)
|
|||
; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
|
||||
; MOVREL: v_movreld_b32_e32
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_idx
|
||||
; IDXMODE: s_set_gpr_idx_on
|
||||
; IDXMODE: v_mov_b32_e32
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: s_cbranch_execnz [[REGLOOP]]
|
||||
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
|
||||
bb:
|
||||
|
|
Loading…
Reference in New Issue