forked from OSchip/llvm-project
AMDGPU/SI: Handle hazard with > 8 byte VMEM stores
Reviewers: arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D25577 llvm-svn: 285359
This commit is contained in:
parent
139a58f75e
commit
b133fbb9a4
|
@ -556,6 +556,10 @@ public:
|
|||
return SGPRInitBug;
|
||||
}
|
||||
|
||||
bool has12DWordStoreHazard() const {
|
||||
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
|
||||
}
|
||||
|
||||
unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
|
||||
|
||||
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs.
|
||||
|
|
|
@ -1164,6 +1164,7 @@ defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
|
|||
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
|
||||
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
|
||||
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
|
||||
// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
|
||||
//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
|
||||
//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
|
||||
//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
|
||||
|
|
|
@ -67,6 +67,9 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
|
|||
if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
|
||||
return NoopHazard;
|
||||
|
||||
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
|
||||
return NoopHazard;
|
||||
|
||||
if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
|
||||
return NoopHazard;
|
||||
|
||||
|
@ -90,14 +93,20 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
|
|||
if (SIInstrInfo::isSMRD(*MI))
|
||||
return std::max(0, checkSMRDHazards(MI));
|
||||
|
||||
if (SIInstrInfo::isVALU(*MI)) {
|
||||
int WaitStates = std::max(0, checkVALUHazards(MI));
|
||||
|
||||
if (SIInstrInfo::isVMEM(*MI))
|
||||
return std::max(0, checkVMEMHazards(MI));
|
||||
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
|
||||
|
||||
if (SIInstrInfo::isDPP(*MI))
|
||||
return std::max(0, checkDPPHazards(MI));
|
||||
WaitStates = std::max(WaitStates, checkDPPHazards(MI));
|
||||
|
||||
if (isDivFMas(MI->getOpcode()))
|
||||
return std::max(0, checkDivFMasHazards(MI));
|
||||
WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
|
||||
|
||||
return WaitStates;
|
||||
}
|
||||
|
||||
if (isSGetReg(MI->getOpcode()))
|
||||
return std::max(0, checkGetRegHazards(MI));
|
||||
|
@ -149,34 +158,40 @@ void GCNHazardRecognizer::RecedeCycle() {
|
|||
// Helper Functions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
int GCNHazardRecognizer::getWaitStatesSinceDef(
|
||||
unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
|
||||
int WaitStates = -1;
|
||||
for (MachineInstr *MI : EmittedInstrs) {
|
||||
++WaitStates;
|
||||
if (!MI || !IsHazardDef(MI))
|
||||
continue;
|
||||
if (MI->modifiesRegister(Reg, TRI))
|
||||
return WaitStates;
|
||||
}
|
||||
return std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
|
||||
int GCNHazardRecognizer::getWaitStatesSince(
|
||||
function_ref<bool(MachineInstr *)> IsHazard) {
|
||||
|
||||
int WaitStates = -1;
|
||||
for (MachineInstr *MI : EmittedInstrs) {
|
||||
++WaitStates;
|
||||
if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
|
||||
if (!MI || !IsHazard(MI))
|
||||
continue;
|
||||
return WaitStates;
|
||||
}
|
||||
return std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
int GCNHazardRecognizer::getWaitStatesSinceDef(
|
||||
unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
|
||||
auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
|
||||
return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
|
||||
};
|
||||
|
||||
return getWaitStatesSince(IsHazardFn);
|
||||
}
|
||||
|
||||
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
|
||||
function_ref<bool(MachineInstr *)> IsHazard) {
|
||||
|
||||
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
|
||||
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
|
||||
};
|
||||
|
||||
return getWaitStatesSince(IsHazardFn);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// No-op Hazard Detection
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -350,3 +365,75 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
|
|||
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
|
||||
return SetRegWaitStates - WaitStatesNeeded;
|
||||
}
|
||||
|
||||
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
|
||||
if (!MI.mayStore())
|
||||
return -1;
|
||||
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
unsigned Opcode = MI.getOpcode();
|
||||
const MCInstrDesc &Desc = MI.getDesc();
|
||||
|
||||
int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
|
||||
int VDataRCID = -1;
|
||||
if (VDataIdx != -1)
|
||||
VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
|
||||
|
||||
if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
|
||||
// For MUBUF/MTBUF instructions this hazard only exists if the
|
||||
// instruction is not using a register in the soffset field.
|
||||
const MachineOperand *SOffset =
|
||||
TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
|
||||
// If we have no soffset operand, then assume this field has been
|
||||
// hardcoded to zero.
|
||||
if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
|
||||
(!SOffset || !SOffset->isReg()))
|
||||
return VDataIdx;
|
||||
}
|
||||
|
||||
// MIMG instructions create a hazard if they don't use a 256-bit T# and
|
||||
// the store size is greater than 8 bytes and they have more than two bits
|
||||
// of their dmask set.
|
||||
// All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
|
||||
if (TII->isMIMG(MI)) {
|
||||
int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
|
||||
assert(SRsrcIdx != -1 &&
|
||||
AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
|
||||
}
|
||||
|
||||
if (TII->isFLAT(MI)) {
|
||||
int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
|
||||
if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
|
||||
return DataIdx;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
|
||||
// This checks for the hazard where VMEM instructions that store more than
|
||||
// 8 bytes can have their store data overwritten by the next instruction.
|
||||
if (!ST.has12DWordStoreHazard())
|
||||
return 0;
|
||||
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
|
||||
|
||||
const int VALUWaitStates = 1;
|
||||
int WaitStatesNeeded = 0;
|
||||
|
||||
for (const MachineOperand &Def : VALU->defs()) {
|
||||
if (!TRI->isVGPR(MRI, Def.getReg()))
|
||||
continue;
|
||||
unsigned Reg = Def.getReg();
|
||||
auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
|
||||
int DataIdx = createsVALUHazard(*MI);
|
||||
return DataIdx >= 0 &&
|
||||
TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
|
||||
};
|
||||
int WaitStatesNeededForDef =
|
||||
VALUWaitStates - getWaitStatesSince(IsHazardFn);
|
||||
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
|
||||
}
|
||||
return WaitStatesNeeded;
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
|
|||
const MachineFunction &MF;
|
||||
const SISubtarget &ST;
|
||||
|
||||
int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
|
||||
int getWaitStatesSinceDef(unsigned Reg,
|
||||
function_ref<bool(MachineInstr *)> IsHazardDef =
|
||||
[](MachineInstr *) { return true; });
|
||||
|
@ -47,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
|
|||
int checkDivFMasHazards(MachineInstr *DivFMas);
|
||||
int checkGetRegHazards(MachineInstr *GetRegInstr);
|
||||
int checkSetRegHazards(MachineInstr *SetRegInstr);
|
||||
int createsVALUHazard(const MachineInstr &MI);
|
||||
int checkVALUHazards(MachineInstr *VALU);
|
||||
public:
|
||||
GCNHazardRecognizer(const MachineFunction &MF);
|
||||
// We can only issue one instruction per cycle.
|
||||
|
|
|
@ -352,8 +352,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
|
|||
|
||||
// Avoid using MCRegisterClass::getSize, since that function will go away
|
||||
// (move from MC* level to Target* level). Return size in bits.
|
||||
unsigned getRegBitWidth(const MCRegisterClass &RC) {
|
||||
switch (RC.getID()) {
|
||||
unsigned getRegBitWidth(unsigned RCID) {
|
||||
switch (RCID) {
|
||||
case AMDGPU::SGPR_32RegClassID:
|
||||
case AMDGPU::VGPR_32RegClassID:
|
||||
case AMDGPU::VS_32RegClassID:
|
||||
|
@ -382,6 +382,10 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
|
|||
}
|
||||
}
|
||||
|
||||
unsigned getRegBitWidth(const MCRegisterClass &RC) {
|
||||
return getRegBitWidth(RC.getID());
|
||||
}
|
||||
|
||||
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
|
||||
unsigned OpNo) {
|
||||
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
|
||||
|
|
|
@ -157,6 +157,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
|
|||
/// \brief Does this operand support only inlinable literals?
|
||||
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
|
||||
|
||||
/// \brief Get the size in bits of the register class with ID \p RCID.
|
||||
unsigned getRegBitWidth(unsigned RCID);
|
||||
|
||||
/// \brief Get the size in bits of a register from the register class \p RC.
|
||||
unsigned getRegBitWidth(const MCRegisterClass &RC);
|
||||
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
|
||||
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
|
||||
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
|
||||
|
||||
--- |
|
||||
define void @div_fmas() { ret void }
|
||||
define void @s_getreg() { ret void }
|
||||
define void @s_setreg() { ret void }
|
||||
define void @vmem_gt_8dw_store() { ret void }
|
||||
...
|
||||
---
|
||||
# GCN-LABEL: name: div_fmas
|
||||
|
@ -159,3 +160,77 @@ body: |
|
|||
S_SETREG_B32 %sgpr1, 0
|
||||
S_ENDPGM
|
||||
...
|
||||
|
||||
...
|
||||
---
|
||||
# GCN-LABEL: name: vmem_gt_8dw_store
|
||||
|
||||
# GCN-LABEL: bb.0:
|
||||
# GCN: BUFFER_STORE_DWORD_OFFSET
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: BUFFER_STORE_DWORDX3_OFFSET
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: BUFFER_STORE_DWORDX4_OFFSET
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: BUFFER_STORE_DWORDX4_OFFSET
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
|
||||
# GCN-LABEL: bb.1:
|
||||
# GCN: FLAT_STORE_DWORDX2
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: FLAT_STORE_DWORDX3
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: FLAT_STORE_DWORDX4
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: FLAT_ATOMIC_CMPSWAP_X2
|
||||
# CIVI: S_NOP
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
# GCN: FLAT_ATOMIC_FCMPSWAP_X2
|
||||
# CIVI: S_NOP
|
||||
# GCN: V_MOV_B32
|
||||
|
||||
name: vmem_gt_8dw_store
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
|
||||
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue