diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 7348b5b56c8b..e1845e2e8e87 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -344,7 +344,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
   assert(DstOpnd && DstOpnd->isReg());
   auto DPPMovReg = DstOpnd->getReg();
-  if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) {
+  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
     LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                          " for all uses\n");
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index bcc3478c67b8..74d77d328019 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -650,7 +650,7 @@ void SIFoldOperands::foldOperand(
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
                                      *OpToFold.getParent(),
-                                     UseMI))
+                                     *UseMI))
         return;

       UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -669,7 +669,7 @@ void SIFoldOperands::foldOperand(
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
                                      *OpToFold.getParent(),
-                                     UseMI))
+                                     *UseMI))
         return;

       // %vgpr = COPY %sgpr0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d855f3f0e42b..34741850f82f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6269,42 +6269,29 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
 }

 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                      unsigned VReg,
+                                      Register VReg,
                                       const MachineInstr &DefMI,
-                                      const MachineInstr *UseMI) {
+                                      const MachineInstr &UseMI) {
   assert(MRI.isSSA() && "Must be run on SSA");

   auto *TRI = MRI.getTargetRegisterInfo();
   auto *DefBB = DefMI.getParent();

-  if (UseMI) {
-    // Don't bother searching between blocks, although it is possible this block
-    // doesn't modify exec.
-    if (UseMI->getParent() != DefBB)
-      return true;
-  } else {
-    int NumUse = 0;
-    const int MaxUseScan = 10;
-
-    for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
-      if (UseInst.getParent() != DefBB)
-        return true;
-
-      if (NumUse++ > MaxUseScan)
-        return true;
-    }
-  }
+  // Don't bother searching between blocks, although it is possible this block
+  // doesn't modify exec.
+  if (UseMI.getParent() != DefBB)
+    return true;

   const int MaxInstScan = 20;
-  int NumScan = 0;
+  int NumInst = 0;

-  // Stop scan at the use if known.
-  auto E = UseMI ? UseMI->getIterator() : DefBB->end();
+  // Stop scan at the use.
+  auto E = UseMI.getIterator();
   for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
     if (I->isDebugInstr())
       continue;

-    if (NumScan++ > MaxInstScan)
+    if (++NumInst > MaxInstScan)
       return true;

     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
@@ -6313,3 +6300,44 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,

   return false;
 }
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                         Register VReg,
+                                         const MachineInstr &DefMI) {
+  assert(MRI.isSSA() && "Must be run on SSA");
+
+  auto *TRI = MRI.getTargetRegisterInfo();
+  auto *DefBB = DefMI.getParent();
+
+  const int MaxUseInstScan = 10;
+  int NumUseInst = 0;
+
+  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
+    // Don't bother searching between blocks, although it is possible this block
+    // doesn't modify exec.
+    if (UseInst.getParent() != DefBB)
+      return true;
+
+    if (++NumUseInst > MaxUseInstScan)
+      return true;
+  }
+
+  const int MaxInstScan = 20;
+  int NumInst = 0;
+
+  // Stop scan when we have seen all the uses.
+  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
+    if (I->isDebugInstr())
+      continue;
+
+    if (++NumInst > MaxInstScan)
+      return true;
+
+    if (I->readsRegister(VReg))
+      if (--NumUseInst == 0)
+        return false;
+
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+      return true;
+  }
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b8275b622722..1f3c659f9d9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1016,13 +1016,19 @@ MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                MachineRegisterInfo &MRI);

 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
-/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p
-/// VReg. Should be run on SSA. Currently does not attempt to track between
-/// blocks.
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
 bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                unsigned VReg,
+                                Register VReg,
                                 const MachineInstr &DefMI,
-                                const MachineInstr *UseMI = nullptr);
+                                const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                   Register VReg,
+                                   const MachineInstr &DefMI);

 namespace AMDGPU {
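(Illustration only, not part of the patch.) A minimal caller sketch of the two entry points after this change, where Reg, Def and Use are hypothetical placeholders for an SSA vreg, its defining instruction, and one of its readers. Both helpers answer conservatively, so callers treat a true result as "possibly unsafe":

  // One known use: scan only the instructions between Def and Use.
  if (execMayBeModifiedBeforeUse(*MRI, Reg, Def, Use))
    return; // EXEC may change on the way to this use; do not fold.

  // All uses of Reg: bail if any use is outside Def's block, or if EXEC
  // may change before the last in-block use has been scanned.
  if (execMayBeModifiedBeforeAnyUse(*MRI, Reg, Def))
    return; // EXEC mask should remain the same for all uses.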
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 969eb3cfa7a6..6f52a5f06a3f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

 declare i32 @llvm.amdgcn.workitem.id.x()

@@ -42,6 +42,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -133,6 +135,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 3ce91e83cf33..f3d50c9c490f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -45,6 +45,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -136,6 +138,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index c1ce93242eee..a170f96e4fe4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -39,6 +39,8 @@ else:
 ; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index c2db5547201f..dd813df32811 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -44,6 +44,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -104,6 +106,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index eb3f0ab17ac8..c32a73ef2b89 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -44,6 +44,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -117,6 +119,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
index d98cde5cff4b..c43fb037d074 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -380,6 +380,38 @@ body: |
     %9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
 ...

+# tests on sequences of dpp consumers followed by control flow
+# CHECK-LABEL: name: dpp_seq_cf
+# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+name: dpp_seq_cf
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
+    %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
+
+    %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
+    %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+...
+
 # old reg def is in diff BB - cannot combine
 # CHECK-LABEL: name: old_in_diff_bb
 # CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
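For contrast, a hand-written MIR sketch in the style of the tests above (hypothetical, not part of the patch): $exec is clobbered between the V_MOV_B32_dpp and its consumer, so execMayBeModifiedBeforeAnyUse reports a possible EXEC change and the combiner has to leave %4 as a plain V_ADD_I32_e32 instead of forming a V_ADD_I32_dpp:

name: dpp_exec_clobbered
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $vgpr0, $vgpr1
    %0:vgpr_32 = COPY $vgpr0
    %1:vgpr_32 = COPY $vgpr1
    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
    $exec = S_MOV_B64 -1
    %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
...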