forked from OSchip/llvm-project
AMDGPU: Allow some control flow intrinsics to be CSEd
This cleans up some unnecessary 'or' instructions in cases with complex loops. In the original testcase where I noticed this, the same 'or' with exec was repeated 5 or 6 times in a row. With this change only one is emitted, or sometimes a copy. llvm-svn: 281786
This commit is contained in:
parent
e1b7d2520d
commit
6408c9135c
|
@ -148,12 +148,15 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
|
|||
|
||||
Break = M.getOrInsertFunction(
|
||||
BreakIntrinsic, Int64, Int64, (Type *)nullptr);
|
||||
cast<Function>(Break)->setDoesNotAccessMemory();
|
||||
|
||||
IfBreak = M.getOrInsertFunction(
|
||||
IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
|
||||
cast<Function>(IfBreak)->setDoesNotAccessMemory();;
|
||||
|
||||
ElseBreak = M.getOrInsertFunction(
|
||||
ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
|
||||
cast<Function>(ElseBreak)->setDoesNotAccessMemory();
|
||||
|
||||
Loop = M.getOrInsertFunction(
|
||||
LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
|
||||
|
|
|
@ -1643,20 +1643,30 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
|
|||
}
|
||||
|
||||
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
|
||||
if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
|
||||
return false;
|
||||
|
||||
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
|
||||
default: return false;
|
||||
case AMDGPUIntrinsic::amdgcn_if:
|
||||
case AMDGPUIntrinsic::amdgcn_else:
|
||||
case AMDGPUIntrinsic::amdgcn_break:
|
||||
case AMDGPUIntrinsic::amdgcn_if_break:
|
||||
case AMDGPUIntrinsic::amdgcn_else_break:
|
||||
case AMDGPUIntrinsic::amdgcn_loop:
|
||||
case AMDGPUIntrinsic::amdgcn_end_cf:
|
||||
return true;
|
||||
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
|
||||
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
|
||||
case AMDGPUIntrinsic::amdgcn_if:
|
||||
case AMDGPUIntrinsic::amdgcn_else:
|
||||
case AMDGPUIntrinsic::amdgcn_end_cf:
|
||||
case AMDGPUIntrinsic::amdgcn_loop:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
|
||||
switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
|
||||
case AMDGPUIntrinsic::amdgcn_break:
|
||||
case AMDGPUIntrinsic::amdgcn_if_break:
|
||||
case AMDGPUIntrinsic::amdgcn_else_break:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void SITargetLowering::createDebuggerPrologueStackObjects(
|
||||
|
@ -1708,30 +1718,50 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|||
Target = BR->getOperand(1);
|
||||
}
|
||||
|
||||
// FIXME: This changes the types of the intrinsics instead of introducing new
|
||||
// nodes with the correct types.
|
||||
// e.g. llvm.amdgcn.loop
|
||||
|
||||
// eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
|
||||
// => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
|
||||
|
||||
if (!isCFIntrinsic(Intr)) {
|
||||
// This is a uniform branch so we don't need to legalize.
|
||||
return BRCOND;
|
||||
}
|
||||
|
||||
bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
|
||||
Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
|
||||
|
||||
assert(!SetCC ||
|
||||
(SetCC->getConstantOperandVal(1) == 1 &&
|
||||
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
|
||||
ISD::SETNE));
|
||||
|
||||
// Build the result and
|
||||
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
|
||||
|
||||
// operands of the new intrinsic call
|
||||
SmallVector<SDValue, 4> Ops;
|
||||
Ops.push_back(BRCOND.getOperand(0));
|
||||
Ops.append(Intr->op_begin() + 1, Intr->op_end());
|
||||
if (HaveChain)
|
||||
Ops.push_back(BRCOND.getOperand(0));
|
||||
|
||||
Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
|
||||
Ops.push_back(Target);
|
||||
|
||||
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
|
||||
|
||||
// build the new intrinsic call
|
||||
SDNode *Result = DAG.getNode(
|
||||
Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
|
||||
DAG.getVTList(Res), Ops).getNode();
|
||||
|
||||
if (!HaveChain) {
|
||||
SDValue Ops[] = {
|
||||
SDValue(Result, 0),
|
||||
BRCOND.getOperand(0)
|
||||
};
|
||||
|
||||
Result = DAG.getMergeValues(Ops, DL).getNode();
|
||||
}
|
||||
|
||||
if (BR) {
|
||||
// Give the branch instruction our target
|
||||
SDValue Ops[] = {
|
||||
|
|
|
@ -128,6 +128,9 @@ class CFPseudoInstSI<dag outs, dag ins, list<dag> pattern = [],
|
|||
|
||||
let Uses = !if(UseExec, [EXEC], []);
|
||||
let Defs = !if(DefExec, [EXEC, SCC], [SCC]);
|
||||
let mayLoad = 0;
|
||||
let mayStore = 0;
|
||||
let hasSideEffects = 0;
|
||||
}
|
||||
|
||||
class Enc32 {
|
||||
|
|
|
@ -1024,8 +1024,6 @@ def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
|
|||
// SI pseudo instructions. These are used by the CFG structurizer pass
|
||||
// and should be lowered to ISA instructions prior to codegen.
|
||||
|
||||
let hasSideEffects = 1 in {
|
||||
|
||||
// Dummy terminator instruction to use after control flow instructions
|
||||
// replaced with exec mask operations.
|
||||
def SI_MASK_BRANCH : PseudoInstSI <
|
||||
|
@ -1044,12 +1042,18 @@ def SI_IF: CFPseudoInstSI <
|
|||
[(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
|
||||
let Constraints = "";
|
||||
let Size = 8;
|
||||
let mayStore = 1;
|
||||
let mayLoad = 1;
|
||||
let hasSideEffects = 1;
|
||||
}
|
||||
|
||||
def SI_ELSE : CFPseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
|
||||
let Constraints = "$src = $dst";
|
||||
let Size = 12;
|
||||
let mayStore = 1;
|
||||
let mayLoad = 1;
|
||||
let hasSideEffects = 1;
|
||||
}
|
||||
|
||||
def SI_LOOP : CFPseudoInstSI <
|
||||
|
@ -1057,6 +1061,9 @@ def SI_LOOP : CFPseudoInstSI <
|
|||
[(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
|
||||
let Size = 8;
|
||||
let isBranch = 1;
|
||||
let hasSideEffects = 1;
|
||||
let mayLoad = 1;
|
||||
let mayStore = 1;
|
||||
}
|
||||
|
||||
} // End isBranch = 1, isTerminator = 1
|
||||
|
@ -1065,24 +1072,35 @@ def SI_END_CF : CFPseudoInstSI <
|
|||
(outs), (ins SReg_64:$saved),
|
||||
[(int_amdgcn_end_cf i64:$saved)], 1, 1> {
|
||||
let Size = 4;
|
||||
let isAsCheapAsAMove = 1;
|
||||
let isReMaterializable = 1;
|
||||
let mayLoad = 1;
|
||||
let mayStore = 1;
|
||||
let hasSideEffects = 1;
|
||||
}
|
||||
|
||||
def SI_BREAK : CFPseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins SReg_64:$src),
|
||||
[(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
|
||||
let Size = 4;
|
||||
let isAsCheapAsAMove = 1;
|
||||
let isReMaterializable = 1;
|
||||
}
|
||||
|
||||
def SI_IF_BREAK : CFPseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
|
||||
[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
|
||||
let Size = 4;
|
||||
let isAsCheapAsAMove = 1;
|
||||
let isReMaterializable = 1;
|
||||
}
|
||||
|
||||
def SI_ELSE_BREAK : CFPseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
|
||||
[(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
|
||||
let Size = 4;
|
||||
let isAsCheapAsAMove = 1;
|
||||
let isReMaterializable = 1;
|
||||
}
|
||||
|
||||
let Uses = [EXEC], Defs = [EXEC,VCC] in {
|
||||
|
@ -1100,7 +1118,6 @@ def SI_KILL_TERMINATOR : SPseudoInstSI <
|
|||
|
||||
} // End Uses = [EXEC], Defs = [EXEC,VCC]
|
||||
|
||||
} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
|
||||
|
||||
def SI_PS_LIVE : PseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins),
|
||||
|
|
|
@ -186,11 +186,11 @@ let TargetPrefix = "amdgcn", isTarget = 1 in {
|
|||
|
||||
/* Control flow Intrinsics */
|
||||
|
||||
def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
|
||||
def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
|
||||
def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
|
||||
def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
|
||||
def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
|
||||
def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
|
||||
def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
|
||||
def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
|
||||
def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
|
||||
def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
|
||||
def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
|
||||
def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
|
||||
def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
|
||||
def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
|
||||
}
|
||||
|
|
|
@ -1,22 +1,48 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; CHECK-LABEL: {{^}}define amdgpu_vs void @main
|
||||
; CHECK: main_body:
|
||||
; CHECK: LOOP.outer:
|
||||
; CHECK: LOOP:
|
||||
; CHECK: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
|
||||
; CHECK: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
|
||||
; OPT-LABEL: {{^}}define amdgpu_vs void @multi_else_break(
|
||||
; OPT: main_body:
|
||||
; OPT: LOOP.outer:
|
||||
; OPT: LOOP:
|
||||
; OPT: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
|
||||
; OPT: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
|
||||
;
|
||||
; CHECK: Flow:
|
||||
; OPT: Flow:
|
||||
;
|
||||
; Ensure two else.break calls, for both the inner and outer loops
|
||||
|
||||
; OPT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
|
||||
; OPT-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
|
||||
; OPT-NEXT: call void @llvm.amdgcn.end.cf
|
||||
;
|
||||
; CHECK: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
|
||||
; CHECK-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
|
||||
; CHECK-NEXT: call void @llvm.amdgcn.end.cf
|
||||
;
|
||||
; CHECK: Flow1:
|
||||
define amdgpu_vs void @main(<4 x float> %vec, i32 %ub, i32 %cont) {
|
||||
; OPT: Flow1:
|
||||
|
||||
; GCN-LABEL: {{^}}multi_else_break:
|
||||
|
||||
; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
|
||||
|
||||
; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
|
||||
; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
|
||||
; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
|
||||
; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
|
||||
|
||||
; Ensure the extra 'or' is eliminated
|
||||
; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
|
||||
; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
|
||||
; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
|
||||
|
||||
; GCN: ; BB#{{[0-9]+}}: ; %Flow1{{$}}
|
||||
; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
|
||||
|
||||
; Ensure copy is eliminated
|
||||
; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
|
||||
; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
|
||||
; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
|
||||
define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
|
||||
main_body:
|
||||
br label %LOOP.outer
|
||||
|
||||
|
@ -38,4 +64,52 @@ ENDIF: ; preds = %LOOP
|
|||
br i1 %tmp51, label %LOOP, label %LOOP.outer
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
; OPT-LABEL: define void @multi_if_break_loop(
|
||||
; OPT: llvm.amdgcn.break
|
||||
; OPT: llvm.amdgcn.loop
|
||||
; OPT: llvm.amdgcn.if.break
|
||||
; OPT: llvm.amdgcn.if.break
|
||||
; OPT: llvm.amdgcn.end.cf
|
||||
|
||||
; GCN-LABEL: {{^}}multi_if_break_loop:
|
||||
; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
|
||||
|
||||
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
|
||||
|
||||
; Uses a copy instead of an or
|
||||
; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
|
||||
; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
|
||||
define void @multi_if_break_loop(i32 %arg) #0 {
|
||||
bb:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tmp = sub i32 %id, %arg
|
||||
br label %bb1
|
||||
|
||||
bb1:
|
||||
%lsr.iv = phi i32 [ undef, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ]
|
||||
%lsr.iv.next = add i32 %lsr.iv, 1
|
||||
%cmp0 = icmp slt i32 %lsr.iv.next, 0
|
||||
%load0 = load volatile i32, i32 addrspace(1)* undef, align 4
|
||||
switch i32 %load0, label %bb9 [
|
||||
i32 0, label %case0
|
||||
i32 1, label %case1
|
||||
]
|
||||
|
||||
case0:
|
||||
%load1 = load volatile i32, i32 addrspace(1)* undef, align 4
|
||||
%cmp1 = icmp slt i32 %tmp, %load1
|
||||
br i1 %cmp1, label %bb1, label %bb9
|
||||
|
||||
case1:
|
||||
%load2 = load volatile i32, i32 addrspace(1)* undef, align 4
|
||||
%cmp2 = icmp slt i32 %tmp, %load2
|
||||
br i1 %cmp2, label %bb1, label %bb9
|
||||
|
||||
bb9:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
|
Loading…
Reference in New Issue