forked from OSchip/llvm-project
[AMDGPU] Fix the SDWA Peephole phase to handle src for dst:UNUSED_PRESERVE.
Summary: The phase attempts to transform operations that extract a portion of a value into an SDWA src operand in cases where that value is used only once. It was not prepared for this use to be the preserved portion of a value for dst:UNUSED_PRESERVE, resulting in a crash or assert. This change either rejects the illegal SDWA attempt, or in the case where dst:WORD_1 and the src_sel would be WORD_0, removes the unneeded extract instruction. Reviewers: arsenm, #amdgpu Reviewed By: arsenm, #amdgpu Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D44364 llvm-svn: 328856
This commit is contained in:
parent
a7d614c3e5
commit
59e5ef793c
|
@ -366,18 +366,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
|
|||
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
|
||||
// Find operand in instruction that matches source operand and replace it with
|
||||
// target operand. Set corresponding src_sel
|
||||
|
||||
bool IsPreserveSrc = false;
|
||||
MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
||||
MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
|
||||
MachineOperand *SrcMods =
|
||||
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
|
||||
assert(Src && (Src->isReg() || Src->isImm()));
|
||||
if (!isSameReg(*Src, *getReplacedOperand())) {
|
||||
// If this is not src0 then it should be src1
|
||||
// If this is not src0 then it could be src1
|
||||
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
||||
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
|
||||
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
|
||||
|
||||
if (!Src ||
|
||||
!isSameReg(*Src, *getReplacedOperand())) {
|
||||
// It's possible this Src is a tied operand for
|
||||
// UNUSED_PRESERVE, in which case we can either
|
||||
// abandon the peephole attempt, or if legal we can
|
||||
// copy the target operand into the tied slot
|
||||
// if the preserve operation will effectively cause the same
|
||||
// result by overwriting the rest of the dst.
|
||||
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
|
||||
MachineOperand *DstUnused =
|
||||
TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
|
||||
|
||||
if (Dst &&
|
||||
DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
|
||||
// This will work if the tied src is acessing WORD_0, and the dst is
|
||||
// writing WORD_1. Modifiers don't matter because all the bits that
|
||||
// would be impacted are being overwritten by the dst.
|
||||
// Any other case will not work.
|
||||
SdwaSel DstSel = static_cast<SdwaSel>(
|
||||
TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
|
||||
if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
|
||||
getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
|
||||
IsPreserveSrc = true;
|
||||
auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
|
||||
AMDGPU::OpName::vdst);
|
||||
auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
|
||||
Src = &MI.getOperand(TiedIdx);
|
||||
SrcSel = nullptr;
|
||||
SrcMods = nullptr;
|
||||
} else {
|
||||
// Not legal to convert this src
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(Src && Src->isReg());
|
||||
|
||||
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
|
||||
|
@ -388,11 +423,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
|
|||
return false;
|
||||
}
|
||||
|
||||
assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
|
||||
assert(isSameReg(*Src, *getReplacedOperand()) &&
|
||||
(IsPreserveSrc || (SrcSel && SrcMods)));
|
||||
}
|
||||
copyRegOperand(*Src, *getTargetOperand());
|
||||
SrcSel->setImm(getSrcSel());
|
||||
SrcMods->setImm(getSrcMods(TII, Src));
|
||||
if (!IsPreserveSrc) {
|
||||
SrcSel->setImm(getSrcSel());
|
||||
SrcMods->setImm(getSrcMods(TII, Src));
|
||||
}
|
||||
getTargetOperand()->setIsKill(false);
|
||||
return true;
|
||||
}
|
||||
|
@ -857,6 +895,9 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
|
|||
|
||||
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
const SDWAOperandsVector &SDWAOperands) {
|
||||
|
||||
DEBUG(dbgs() << "Convert instruction:" << MI);
|
||||
|
||||
// Convert to sdwa
|
||||
int SDWAOpcode;
|
||||
unsigned Opcode = MI.getOpcode();
|
||||
|
@ -982,9 +1023,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
|||
}
|
||||
}
|
||||
|
||||
// Check for a preserved register that needs to be copied.
|
||||
auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
|
||||
if (DstUnused &&
|
||||
DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
|
||||
// We expect, if we are here, that the instruction was already in it's SDWA form,
|
||||
// with a tied operand.
|
||||
assert(Dst && Dst->isTied());
|
||||
assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
|
||||
// We also expect a vdst, since sdst can't preserve.
|
||||
auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
|
||||
assert(PreserveDstIdx != -1);
|
||||
|
||||
auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
|
||||
auto Tied = MI.getOperand(TiedIdx);
|
||||
|
||||
SDWAInst.add(Tied);
|
||||
SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
|
||||
}
|
||||
|
||||
// Apply all sdwa operand patterns.
|
||||
bool Converted = false;
|
||||
for (auto &Operand : SDWAOperands) {
|
||||
DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
|
||||
// There should be no intesection between SDWA operands and potential MIs
|
||||
// e.g.:
|
||||
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
|
||||
|
@ -1005,8 +1066,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
|||
return false;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Convert instruction:" << MI
|
||||
<< "Into:" << *SDWAInst << '\n');
|
||||
DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
|
||||
++NumSDWAInstructionsPeepholed;
|
||||
|
||||
MI.eraseFromParent();
|
||||
|
|
|
@ -54,3 +54,91 @@ body: |
|
|||
FLAT_STORE_DWORD %0, %13, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
|
||||
$sgpr30_sgpr31 = COPY %2
|
||||
S_SETPC_B64_return $sgpr30_sgpr31
|
||||
|
||||
---
|
||||
# SDWA-LABEL: sdwa_preserve_keep
|
||||
# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
|
||||
# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
|
||||
|
||||
# SDWA: v_and_b32_e32 [[AND:v[0-9]+]], 0xff, [[FIRST]]
|
||||
# SDWA: v_mov_b32_sdwa [[AND]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
|
||||
# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[AND]]
|
||||
|
||||
name: sdwa_preserve_keep
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vreg_64 }
|
||||
- { id: 2, class: sreg_64 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: vgpr_32 }
|
||||
- { id: 5, class: sreg_32_xm0_xexec }
|
||||
- { id: 6, class: vgpr_32 }
|
||||
- { id: 7, class: vgpr_32 }
|
||||
- { id: 8, class: sreg_32_xm0 }
|
||||
- { id: 9, class: vgpr_32 }
|
||||
- { id: 10, class: sreg_32_xm0 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 17, class: vgpr_32 }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
|
||||
|
||||
%2 = COPY $sgpr30_sgpr31
|
||||
%1 = COPY $vgpr2_vgpr3
|
||||
%0 = COPY $vgpr0_vgpr1
|
||||
%3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
|
||||
%4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
|
||||
|
||||
%9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
|
||||
%10:sreg_32_xm0 = S_MOV_B32 255
|
||||
%11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
|
||||
%17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
|
||||
FLAT_STORE_DWORD %0, %17, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
# SDWA-LABEL: sdwa_preserve_remove
|
||||
# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
|
||||
# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
|
||||
|
||||
# SDWA: v_mov_b32_sdwa [[FIRST]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
|
||||
# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[FIRST]]
|
||||
|
||||
name: sdwa_preserve_remove
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vreg_64 }
|
||||
- { id: 2, class: sreg_64 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: vgpr_32 }
|
||||
- { id: 5, class: sreg_32_xm0_xexec }
|
||||
- { id: 6, class: vgpr_32 }
|
||||
- { id: 7, class: vgpr_32 }
|
||||
- { id: 8, class: sreg_32_xm0 }
|
||||
- { id: 9, class: vgpr_32 }
|
||||
- { id: 10, class: sreg_32_xm0 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 17, class: vgpr_32 }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
|
||||
|
||||
%2 = COPY $sgpr30_sgpr31
|
||||
%1 = COPY $vgpr2_vgpr3
|
||||
%0 = COPY $vgpr0_vgpr1
|
||||
%3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
|
||||
%4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
|
||||
|
||||
%9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
|
||||
%10:sreg_32_xm0 = S_MOV_B32 65535
|
||||
%11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
|
||||
%17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
|
||||
FLAT_STORE_DWORD %0, %17, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue