forked from OSchip/llvm-project
[AMDGPU] Fix DGEMM hazard for GFX90a
Between a VALU write and a memory (VM, L/DS, FLAT) instruction that uses the written register, the SQ normally inserts wait states to avoid a data hazard. However, when there is a DGEMM instruction in between them, the SQ incorrectly disables those wait states, so the data hazard needs to be handled with this workaround. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D130677
This commit is contained in:
parent
9bab358e39
commit
7fc52d7c8b
|
@ -2268,12 +2268,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
|
|||
if (SIInstrInfo::isMFMA(*MI))
|
||||
return 0;
|
||||
|
||||
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
|
||||
int WaitStatesNeeded = 0;
|
||||
|
||||
bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
|
||||
SIInstrInfo::isFLAT(*MI) ||
|
||||
SIInstrInfo::isDS(*MI) ||
|
||||
SIInstrInfo::isEXP(*MI);
|
||||
bool IsMem = SIInstrInfo::isVMEM(*MI) ||
|
||||
SIInstrInfo::isFLAT(*MI) ||
|
||||
SIInstrInfo::isDS(*MI);
|
||||
bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
|
||||
bool IsVALU = SIInstrInfo::isVALU(*MI);
|
||||
|
||||
const MachineInstr *MFMA = nullptr;
|
||||
|
@ -2295,6 +2297,20 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
|
|||
return true;
|
||||
};
|
||||
|
||||
bool DGEMMAfterVALUWrite = false;
|
||||
auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
|
||||
// Found DGEMM on reverse traversal to def.
|
||||
if (isDGEMM(MI.getOpcode()))
|
||||
DGEMMAfterVALUWrite = true;
|
||||
|
||||
// Only hazard if register is defined by a VALU and a DGEMM is found after
|
||||
// the def.
|
||||
if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
|
||||
AMDGPU::OpName::src2);
|
||||
|
||||
|
@ -2316,6 +2332,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
|
|||
const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
|
||||
const int DotWriteSameDotReadSrcAB = 3;
|
||||
const int DotWriteDifferentVALURead = 3;
|
||||
const int DMFMABetweenVALUWriteVMEMRead = 2;
|
||||
const int MaxWaitStates = 19;
|
||||
|
||||
for (const MachineOperand &Use : MI->explicit_uses()) {
|
||||
|
@ -2339,6 +2356,22 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
|
|||
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
|
||||
}
|
||||
|
||||
// Workaround for a HW data hazard bug observed only on GFX90A. When there
|
||||
// is a DGEMM instruction in-between a VALU write and a memory (VMEM, FLAT,
|
||||
// DS) instruction that reads the written register, the SQ incorrectly
|
||||
// omits the two wait states needed between them to avoid the data hazard.
|
||||
if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
|
||||
DGEMMAfterVALUWrite = false;
|
||||
if (TRI.isVectorRegister(MRI, Reg)) {
|
||||
int WaitStatesNeededForUse =
|
||||
DMFMABetweenVALUWriteVMEMRead -
|
||||
getWaitStatesSinceDef(Reg, IsDGEMMHazard,
|
||||
DMFMABetweenVALUWriteVMEMRead);
|
||||
|
||||
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
|
||||
}
|
||||
}
|
||||
|
||||
MFMA = nullptr;
|
||||
WaitStatesSinceDef =
|
||||
getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
|
||||
|
|
|
@ -1308,3 +1308,178 @@ body: |
|
|||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_store
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: dgemm_between_valu_write_buffer_store
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_load
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: BUFFER_LOAD_DWORD
|
||||
name: dgemm_between_valu_write_buffer_load
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_global_store
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: GLOBAL_STORE_DWORD
|
||||
|
||||
name: dgemm_between_valu_write_global_store
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_global_load
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: GLOBAL_LOAD_DWORD
|
||||
name: dgemm_between_valu_write_global_load
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_ds_write
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: DS_WRITE_B32
|
||||
name: dgemm_between_valu_write_ds_write
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
DS_WRITE_B32 $vgpr1, $vgpr0, 0, 0, implicit $m0, implicit $mode, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_ds_read
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: DS_READ_B32_gfx9
|
||||
name: dgemm_between_valu_write_ds_read
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr1 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_flat_store
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dgemm_between_valu_write_flat_store
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr2, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_flat_load
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: FLAT_LOAD_DWORD
|
||||
name: dgemm_between_valu_write_flat_load
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_scratch_store
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: SCRATCH_STORE_DWORD
|
||||
name: dgemm_between_valu_write_scratch_store
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
SCRATCH_STORE_DWORD $vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_scratch_load
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: SCRATCH_LOAD_DWORD
|
||||
name: dgemm_between_valu_write_scratch_load
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough1
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN: bb.1:
|
||||
# GCN-NEXT: S_NOP
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: dgemm_between_valu_write_buffer_store_fallthrough1
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
bb.1:
|
||||
BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough2
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN: bb.1:
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: dgemm_between_valu_write_buffer_store_fallthrough2
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
|
||||
bb.1:
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough3
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN: bb.1:
|
||||
# GCN: bb.2:
|
||||
# GCN-NEXT: V_MFMA
|
||||
# GCN-NEXT: S_NOP
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: dgemm_between_valu_write_buffer_store_fallthrough3
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
|
||||
bb.1:
|
||||
|
||||
bb.2:
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
|
|
@ -2016,3 +2016,15 @@ body: |
|
|||
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
|
||||
...
|
||||
# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_no_snop
|
||||
# GCN: V_MOV_B32_e32
|
||||
# GCN-NEXT: V_MFMA_F64
|
||||
# GCN-NOT: S_NOP
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: dgemm_between_valu_write_buffer_store_no_snop
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
|
||||
BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue