[MachineCSE] Prevent CSE of non-local convergent instrs
At the moment, MachineCSE allows CSE-ing convergent instrs which are non-local to each other. This can cause illegal codegen, as convergent instrs are control-flow dependent.

The patch prevents non-local CSE of convergent instrs by adding a check in isProfitableToCSE that rejects a CSE candidate when the convergent instr and the existing def live in different basic blocks. We can still CSE convergent instrs which are in the same control flow scope, so the patch purposely does not make all convergent instrs non-CSE candidates in isCSECandidate.

Differential Revision: https://reviews.llvm.org/D101187
commit 59f2dd5f1a (parent fc88d927e3)
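In essence, the new rule boils down to the small predicate below. This is only a sketch restating the check added in the diff that follows; the helper name is hypothetical, and the real change lives directly inside MachineCSE::isProfitableToCSE.

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"

  using namespace llvm;

  // Sketch only: CSBB is the block holding the already-seen common
  // subexpression, MI is the duplicate instruction being considered.
  // A convergent MI may only be CSE-ed against a def in its own block;
  // anything cross-block ("non-local") is rejected as unprofitable.
  static bool rejectNonLocalConvergentCSE(const MachineInstr &MI,
                                          const MachineBasicBlock *CSBB) {
    return MI.isConvergent() && MI.getParent() != CSBB;
  }

Because the rejection happens in the profitability check rather than in isCSECandidate, same-block CSE of convergent instructions keeps working.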
@@ -433,6 +433,11 @@ bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg,
                                    MachineBasicBlock *CSBB, MachineInstr *MI) {
   // FIXME: Heuristics that works around the lack the live range splitting.
 
+  MachineBasicBlock *BB = MI->getParent();
+  // Prevent CSE-ing non-local convergent instructions.
+  if (MI->isConvergent() && CSBB != BB)
+    return false;
+
   // If CSReg is used at all uses of Reg, CSE should not increase register
   // pressure of CSReg.
   bool MayIncreasePressure = true;
@@ -455,7 +460,6 @@ bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg,
   // an immediate predecessor. We don't want to increase register pressure and
   // end up causing other computation to be spilled.
   if (TII->isAsCheapAsAMove(*MI)) {
-    MachineBasicBlock *BB = MI->getParent();
     if (CSBB != BB && !CSBB->isSuccessor(BB))
       return false;
   }
@@ -0,0 +1,73 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s
+
+# Check that we don't CSE non-local convergent instrs. Otherwise, reusing defs
+# of convergent instrs from different control flow scopes can cause illegal
+# codegen. Previously, the swizzle in bb2 would be CSE-ed in favor of using the
+# swizzle in bb1 despite bb2 being a different control flow scope.
+
+# CHECK-LABEL: name: no_cse
+# CHECK: bb.1.if.then
+# CHECK: [[SWIZZLE1:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC:%[0-9]+]], 100, 0, implicit $exec
+# CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE1]], {{%[0-9]+}}, 0, implicit $exec
+# CHECK-NEXT: S_CMP_LT_I32 {{.*}} implicit-def $scc
+# CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+# CHECK-NEXT: S_BRANCH %bb.2
+# CHECK: bb.2.if.then.if.then
+# CHECK: [[SWIZZLE2:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC]], 100, 0, implicit $exec
+# CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE2]], {{%[0-9]+}}, 0, implicit $exec
+
+--- |
+  define amdgpu_kernel void @no_cse(i32 addrspace(1)*, i32, i1) {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.then.if.then:
+    unreachable
+  if.then.phi:
+    unreachable
+  exit:
+    unreachable
+  }
+...
+---
+name: no_cse
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+    %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 2, 0
+    %3:sreg_64 = COPY %1
+    %4:sreg_32 = COPY %2.sub1
+    %5:sreg_32 = S_MOV_B32 42
+    S_CMP_EQ_U32 %4, %5, implicit-def $scc
+    %6:vgpr_32 = COPY %5, implicit $exec
+    S_CBRANCH_SCC1 %bb.4, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1.if.then:
+    %7:sreg_32 = COPY %2.sub0
+    %8:vgpr_32 = COPY %7
+    %9:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
+    %10:vgpr_32, %21:sreg_32 = V_ADD_CO_U32_e64 %9, %5, 0, implicit $exec
+    S_CMP_LT_I32 %7, %5, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.3, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.2.if.then.if.then:
+    %11:sreg_32 = S_MOV_B32 64
+    %12:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
+    %13:vgpr_32, %24:sreg_32 = V_ADD_CO_U32_e64 %12, %11, 0, implicit $exec
+
+  bb.3.if.then.phi:
+    %14:vgpr_32 = PHI %10, %bb.1, %13, %bb.2
+
+  bb.4.exit:
+    %15:vgpr_32 = PHI %6, %bb.0, %14, %bb.3
+    %16:vreg_64 = COPY %3
+    FLAT_STORE_DWORD %16, %15, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+
+...