forked from OSchip/llvm-project
[AMDGPU] Fix DPP combiner check for exec modification
Summary: r363675 changed the exec modification helper function, now called execMayBeModifiedBeforeUse, so that if no UseMI is specified it checks all instructions in the basic block, even beyond the last use. That meant that the DPP combiner no longer worked in any basic block that ended with a control flow instruction, and in particular it didn't work on code sequences generated by the atomic optimizer. Fix it by reinstating the old behaviour but in a new helper function execMayBeModifiedBeforeAnyUse, and limiting the number of instructions scanned. Reviewers: arsenm, vpykhtin Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64393 llvm-svn: 365910
This commit is contained in:
parent
f625a8a250
commit
27ec195f39
|
@ -344,7 +344,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
|
||||||
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
|
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
|
||||||
assert(DstOpnd && DstOpnd->isReg());
|
assert(DstOpnd && DstOpnd->isReg());
|
||||||
auto DPPMovReg = DstOpnd->getReg();
|
auto DPPMovReg = DstOpnd->getReg();
|
||||||
if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) {
|
if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
|
||||||
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
|
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
|
||||||
" for all uses\n");
|
" for all uses\n");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -650,7 +650,7 @@ void SIFoldOperands::foldOperand(
|
||||||
if (execMayBeModifiedBeforeUse(*MRI,
|
if (execMayBeModifiedBeforeUse(*MRI,
|
||||||
UseMI->getOperand(UseOpIdx).getReg(),
|
UseMI->getOperand(UseOpIdx).getReg(),
|
||||||
*OpToFold.getParent(),
|
*OpToFold.getParent(),
|
||||||
UseMI))
|
*UseMI))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
|
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
|
||||||
|
@ -669,7 +669,7 @@ void SIFoldOperands::foldOperand(
|
||||||
if (execMayBeModifiedBeforeUse(*MRI,
|
if (execMayBeModifiedBeforeUse(*MRI,
|
||||||
UseMI->getOperand(UseOpIdx).getReg(),
|
UseMI->getOperand(UseOpIdx).getReg(),
|
||||||
*OpToFold.getParent(),
|
*OpToFold.getParent(),
|
||||||
UseMI))
|
*UseMI))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// %vgpr = COPY %sgpr0
|
// %vgpr = COPY %sgpr0
|
||||||
|
|
|
@ -6269,42 +6269,29 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
|
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
|
||||||
unsigned VReg,
|
Register VReg,
|
||||||
const MachineInstr &DefMI,
|
const MachineInstr &DefMI,
|
||||||
const MachineInstr *UseMI) {
|
const MachineInstr &UseMI) {
|
||||||
assert(MRI.isSSA() && "Must be run on SSA");
|
assert(MRI.isSSA() && "Must be run on SSA");
|
||||||
|
|
||||||
auto *TRI = MRI.getTargetRegisterInfo();
|
auto *TRI = MRI.getTargetRegisterInfo();
|
||||||
auto *DefBB = DefMI.getParent();
|
auto *DefBB = DefMI.getParent();
|
||||||
|
|
||||||
if (UseMI) {
|
// Don't bother searching between blocks, although it is possible this block
|
||||||
// Don't bother searching between blocks, although it is possible this block
|
// doesn't modify exec.
|
||||||
// doesn't modify exec.
|
if (UseMI.getParent() != DefBB)
|
||||||
if (UseMI->getParent() != DefBB)
|
return true;
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
int NumUse = 0;
|
|
||||||
const int MaxUseScan = 10;
|
|
||||||
|
|
||||||
for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
|
|
||||||
if (UseInst.getParent() != DefBB)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
if (NumUse++ > MaxUseScan)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const int MaxInstScan = 20;
|
const int MaxInstScan = 20;
|
||||||
int NumScan = 0;
|
int NumInst = 0;
|
||||||
|
|
||||||
// Stop scan at the use if known.
|
// Stop scan at the use.
|
||||||
auto E = UseMI ? UseMI->getIterator() : DefBB->end();
|
auto E = UseMI.getIterator();
|
||||||
for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
|
for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
|
||||||
if (I->isDebugInstr())
|
if (I->isDebugInstr())
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (NumScan++ > MaxInstScan)
|
if (++NumInst > MaxInstScan)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
|
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
|
||||||
|
@ -6313,3 +6300,44 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
|
||||||
|
Register VReg,
|
||||||
|
const MachineInstr &DefMI) {
|
||||||
|
assert(MRI.isSSA() && "Must be run on SSA");
|
||||||
|
|
||||||
|
auto *TRI = MRI.getTargetRegisterInfo();
|
||||||
|
auto *DefBB = DefMI.getParent();
|
||||||
|
|
||||||
|
const int MaxUseInstScan = 10;
|
||||||
|
int NumUseInst = 0;
|
||||||
|
|
||||||
|
for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
|
||||||
|
// Don't bother searching between blocks, although it is possible this block
|
||||||
|
// doesn't modify exec.
|
||||||
|
if (UseInst.getParent() != DefBB)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (++NumUseInst > MaxUseInstScan)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int MaxInstScan = 20;
|
||||||
|
int NumInst = 0;
|
||||||
|
|
||||||
|
// Stop scan when we have seen all the uses.
|
||||||
|
for (auto I = std::next(DefMI.getIterator()); ; ++I) {
|
||||||
|
if (I->isDebugInstr())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (++NumInst > MaxInstScan)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (I->readsRegister(VReg))
|
||||||
|
if (--NumUseInst == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1016,13 +1016,19 @@ MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
|
||||||
MachineRegisterInfo &MRI);
|
MachineRegisterInfo &MRI);
|
||||||
|
|
||||||
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
|
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
|
||||||
/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p
|
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
|
||||||
/// VReg. Should be run on SSA. Currently does not attempt to track between
|
/// attempt to track between blocks.
|
||||||
/// blocks.
|
|
||||||
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
|
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
|
||||||
unsigned VReg,
|
Register VReg,
|
||||||
const MachineInstr &DefMI,
|
const MachineInstr &DefMI,
|
||||||
const MachineInstr *UseMI = nullptr);
|
const MachineInstr &UseMI);
|
||||||
|
|
||||||
|
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
|
||||||
|
/// DefMI and all its non-debug uses. Should be run on SSA. Conservatively returns true if a use is in another block (no tracking between blocks),
|
||||||
|
/// or if the scan limits on the number of using instructions or scanned instructions are exceeded.
|
||||||
|
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
|
||||||
|
Register VReg,
|
||||||
|
const MachineInstr &DefMI);
|
||||||
|
|
||||||
namespace AMDGPU {
|
namespace AMDGPU {
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
||||||
|
|
||||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
|
||||||
|
@ -42,6 +42,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_add v[[value]]
|
; GFX8MORE: buffer_atomic_add v[[value]]
|
||||||
|
@ -133,6 +135,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
|
|
|
@ -45,6 +45,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
||||||
|
@ -136,6 +138,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
||||||
|
|
|
@ -39,6 +39,8 @@ else:
|
||||||
; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
|
; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
|
||||||
; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
|
; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
|
||||||
; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
|
; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
|
; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
|
|
|
@ -44,6 +44,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_add v[[value]]
|
; GFX8MORE: buffer_atomic_add v[[value]]
|
||||||
|
@ -104,6 +106,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
|
|
|
@ -44,6 +44,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
|
; GFX8MORE: v_add_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_add v[[value]]
|
; GFX8MORE: buffer_atomic_add v[[value]]
|
||||||
|
@ -117,6 +119,8 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
|
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
|
|
|
@ -380,6 +380,38 @@ body: |
|
||||||
%9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
|
%9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
|
||||||
...
|
...
|
||||||
|
|
||||||
|
# tests on sequences of dpp consumers followed by control flow
|
||||||
|
# CHECK-LABEL: name: dpp_seq_cf
|
||||||
|
# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
|
||||||
|
# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
|
||||||
|
# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||||
|
|
||||||
|
name: dpp_seq_cf
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
successors: %bb.1, %bb.2
|
||||||
|
liveins: $vgpr0, $vgpr1
|
||||||
|
%0:vgpr_32 = COPY $vgpr0
|
||||||
|
%1:vgpr_32 = COPY $vgpr1
|
||||||
|
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
|
||||||
|
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||||
|
%4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
|
||||||
|
%5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
|
||||||
|
%6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
|
||||||
|
|
||||||
|
%7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
|
||||||
|
%8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||||
|
S_BRANCH %bb.1
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
successors: %bb.2
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||||
|
...
|
||||||
|
|
||||||
# old reg def is in diff BB - cannot combine
|
# old reg def is in diff BB - cannot combine
|
||||||
# CHECK-LABEL: name: old_in_diff_bb
|
# CHECK-LABEL: name: old_in_diff_bb
|
||||||
# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
|
# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
|
||||||
|
|
Loading…
Reference in New Issue