forked from OSchip/llvm-project
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary: Up to gfx9, writes to vcc_lo and vcc_hi by instructions like v_readlane and v_readfirstlane do not update vccz to reflect the new value of vcc. Fix it by reusing part of the existing vccz bug handling code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz just before an instruction that needs the correct value. Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D69661
This commit is contained in:
parent
00efeae34f
commit
4a331beadc
|
@ -587,6 +587,11 @@ public:
|
|||
return getGeneration() <= SEA_ISLANDS;
|
||||
}
|
||||
|
||||
/// Returns true if writes to VCC_LO/VCC_HI update the VCCZ flag. This is
/// reported only for generations GFX10 and later; for earlier generations
/// it returns false.
|
||||
bool partialVCCWritesUpdateVCCZ() const {
|
||||
return getGeneration() >= GFX10;
|
||||
}
|
||||
|
||||
/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
|
||||
/// was written by a VALU instruction.
|
||||
bool hasSMRDReadVALUDefHazard() const {
|
||||
|
|
|
@ -1383,6 +1383,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
ScoreBrackets.dump();
|
||||
});
|
||||
|
||||
// Assume VCCZ is correct at basic block boundaries, unless and until we need
|
||||
// to handle cases where that is not true.
|
||||
bool VCCZCorrect = true;
|
||||
|
||||
// Walk over the instructions.
|
||||
MachineInstr *OldWaitcntInstr = nullptr;
|
||||
|
||||
|
@ -1402,13 +1406,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
continue;
|
||||
}
|
||||
|
||||
bool VCCZBugWorkAround = false;
|
||||
// We might need to restore vccz to its correct value for either of two
|
||||
// different reasons; see ST->hasReadVCCZBug() and
|
||||
// ST->partialVCCWritesUpdateVCCZ().
|
||||
bool RestoreVCCZ = false;
|
||||
if (readsVCCZ(Inst)) {
|
||||
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
|
||||
ScoreBrackets.getScoreUB(LGKM_CNT) &&
|
||||
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
|
||||
if (ST->hasReadVCCZBug())
|
||||
VCCZBugWorkAround = true;
|
||||
if (!VCCZCorrect)
|
||||
RestoreVCCZ = true;
|
||||
else if (ST->hasReadVCCZBug()) {
|
||||
// There is a hardware bug on CI/SI where an SMRD instruction may corrupt the
|
||||
// vccz bit, so when we detect that an instruction may read from a
|
||||
// corrupt vccz bit, we need to:
|
||||
// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
|
||||
// operations to complete.
|
||||
// 2. Restore the correct value of vccz by writing the current value
|
||||
// of vcc back to vcc.
|
||||
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
|
||||
ScoreBrackets.getScoreUB(LGKM_CNT) &&
|
||||
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
|
||||
RestoreVCCZ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1419,6 +1436,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
}
|
||||
}
|
||||
|
||||
if (!ST->partialVCCWritesUpdateVCCZ()) {
|
||||
// Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
|
||||
// Writes to vcc will fix it.
|
||||
if (Inst.definesRegister(AMDGPU::VCC_LO) ||
|
||||
Inst.definesRegister(AMDGPU::VCC_HI))
|
||||
VCCZCorrect = false;
|
||||
else if (Inst.definesRegister(AMDGPU::VCC))
|
||||
VCCZCorrect = true;
|
||||
}
|
||||
|
||||
// Generate an s_waitcnt instruction to be placed before
|
||||
// Inst, if needed.
|
||||
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
|
||||
|
@ -1444,7 +1471,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
|
||||
// TODO: Remove this work-around after fixing the scheduler and enable the
|
||||
// assert above.
|
||||
if (VCCZBugWorkAround) {
|
||||
if (RestoreVCCZ) {
|
||||
// Restore the vccz bit. Any time a value is written to vcc, the vccz
|
||||
// bit is updated, so we can restore the bit by reading the value of
|
||||
// vcc and then writing it back to the register.
|
||||
|
@ -1452,6 +1479,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
|
||||
TRI->getVCC())
|
||||
.addReg(TRI->getVCC());
|
||||
VCCZCorrect = true;
|
||||
Modified = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -85,3 +85,81 @@ body: |
|
|||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
---
|
||||
# Test that after reloading vcc spilled to a vgpr, we insert any necessary
|
||||
# instructions to fix vccz.
|
||||
|
||||
# CHECK-LABEL: name: reload_vcc_from_vgpr
|
||||
# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
|
||||
# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
|
||||
# SI: $vcc = S_MOV_B64 $vcc
|
||||
# GFX9: $vcc = S_MOV_B64 $vcc
|
||||
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
|
||||
name: reload_vcc_from_vgpr
|
||||
body: |
|
||||
bb.0:
|
||||
$vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
|
||||
$vcc_hi = V_READLANE_B32_vi $vgpr0, 9
|
||||
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
bb.1:
|
||||
|
||||
...
|
||||
---
|
||||
# Test that after reloading vcc spilled to memory, we insert any necessary
|
||||
# instructions to fix vccz.
|
||||
|
||||
# CHECK-LABEL: name: reload_vcc_from_mem
|
||||
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
|
||||
# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
|
||||
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
|
||||
# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
|
||||
# SI: $vcc = S_MOV_B64 $vcc
|
||||
# GFX9: $vcc = S_MOV_B64 $vcc
|
||||
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
|
||||
name: reload_vcc_from_mem
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
|
||||
$vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
|
||||
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
|
||||
$vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
|
||||
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
bb.1:
|
||||
|
||||
...
|
||||
---
|
||||
# Test that after inline asm that defines vcc_lo, we insert any necessary
|
||||
# instructions to fix vccz.
|
||||
|
||||
# CHECK-LABEL: name: inlineasm_def_vcc_lo
|
||||
# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
|
||||
# SI: $vcc = S_MOV_B64 $vcc
|
||||
# GFX9: $vcc = S_MOV_B64 $vcc
|
||||
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
|
||||
name: inlineasm_def_vcc_lo
|
||||
body: |
|
||||
bb.0:
|
||||
INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
|
||||
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
bb.1:
|
||||
|
||||
...
|
||||
---
|
||||
# Test that after inline asm that defines vcc, no unnecessary instructions are
|
||||
# inserted to fix vccz.
|
||||
|
||||
# CHECK-LABEL: name: inlineasm_def_vcc
|
||||
# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
|
||||
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
|
||||
name: inlineasm_def_vcc
|
||||
body: |
|
||||
bb.0:
|
||||
INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
|
||||
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
|
||||
bb.1:
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue