forked from OSchip/llvm-project
[SimplifyCFG] Allow hoisting terminators only with HoistCommonInsts=false.
As a side-effect of the change to default HoistCommonInsts to false early in the pipeline, we fail to convert conditional branch & phis to selects early on, which prevents vectorization for loops that contain conditional branches that effectively are selects (or if the loop gets vectorized, it will get vectorized very inefficiently). This patch updates SimplifyCFG to perform hoisting if the only instruction in both BBs is an equal branch. In this case, the only additional instructions are selects for phis, which should be cheap. Even though we perform hoisting, the benefits of this kind of hoisting should by far outweigh the negatives. For example, the loop in the code below will not get vectorized on AArch64 with the current default, but will with the patch. This is a fundamental pattern we should definitely vectorize. Besides that, I think the select variants should be easier to use for reasoning across other passes as well. https://clang.godbolt.org/z/sbjd8Wshx ``` double clamp(double v) { if (v < 0.0) return 0.0; if (v > 6.0) return 6.0; return v; } void loop(double* X, double *Y) { for (unsigned i = 0; i < 20000; i++) { X[i] = clamp(Y[i]); } } ``` Reviewed By: lebedev.ri Differential Revision: https://reviews.llvm.org/D100329
This commit is contained in:
parent
b757bc14e6
commit
467b1f1cd2
|
@ -253,7 +253,8 @@ class SimplifyCFGOpt {
|
|||
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
|
||||
IRBuilder<> &Builder);
|
||||
|
||||
bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
|
||||
bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI,
|
||||
bool EqTermsOnly);
|
||||
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
|
||||
const TargetTransformInfo &TTI);
|
||||
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
|
||||
|
@ -1388,9 +1389,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
|
|||
|
||||
/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
|
||||
/// in the two blocks up into the branch block. The caller of this function
|
||||
/// guarantees that BI's block dominates BB1 and BB2.
|
||||
/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given,
|
||||
/// only perform hoisting in case both blocks only contain a terminator. In that
|
||||
/// case, only the original BI will be replaced and selects for PHIs are added.
|
||||
bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
|
||||
const TargetTransformInfo &TTI) {
|
||||
const TargetTransformInfo &TTI,
|
||||
bool EqTermsOnly) {
|
||||
// This does very trivial matching, with limited scanning, to find identical
|
||||
// instructions in the two blocks. In particular, we don't want to get into
|
||||
// O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
|
||||
|
@ -1427,6 +1431,16 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
|
|||
++NumHoistCommonCode;
|
||||
});
|
||||
|
||||
// Check if only hoisting terminators is allowed. This does not add new
|
||||
// instructions to the hoist location.
|
||||
if (EqTermsOnly) {
|
||||
if (!I1->isIdenticalToWhenDefined(I2))
|
||||
return false;
|
||||
if (!I1->isTerminator())
|
||||
return false;
|
||||
goto HoistTerminator;
|
||||
}
|
||||
|
||||
do {
|
||||
// If we are hoisting the terminator instruction, don't move one (making a
|
||||
// broken BB), instead clone it, and remove BI.
|
||||
|
@ -6443,9 +6457,9 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
|
|||
// can hoist it up to the branching block.
|
||||
if (BI->getSuccessor(0)->getSinglePredecessor()) {
|
||||
if (BI->getSuccessor(1)->getSinglePredecessor()) {
|
||||
if (HoistCommon && Options.HoistCommonInsts)
|
||||
if (HoistThenElseCodeToIf(BI, TTI))
|
||||
return requestResimplify();
|
||||
if (HoistCommon &&
|
||||
HoistThenElseCodeToIf(BI, TTI, !Options.HoistCommonInsts))
|
||||
return requestResimplify();
|
||||
} else {
|
||||
// If Successor #1 has multiple preds, we may be able to conditionally
|
||||
// execute Successor #0 if it branches to Successor #1.
|
||||
|
|
|
@ -41,28 +41,54 @@ return: ; preds = %if.end3, %if.then2,
|
|||
define void @loop(double* %X, double* %Y) {
|
||||
; CHECK-LABEL: @loop(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 20000
|
||||
; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr double, double* [[Y:%.*]], i64 20000
|
||||
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP9]], [[X]]
|
||||
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
|
||||
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
|
||||
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
|
||||
; CHECK: vector.body:
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
|
||||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !alias.scope !0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
|
||||
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8, !alias.scope !0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD11]], <double 6.000000e+00, double 6.000000e+00>
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP8]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD11]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP6]], <2 x double> zeroinitializer, <2 x double> [[TMP10]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8, !alias.scope !3, !noalias !0
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 2
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
|
||||
; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
|
||||
; CHECK: for.cond.cleanup:
|
||||
; CHECK-NEXT: ret void
|
||||
; CHECK: for.body:
|
||||
; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[CLAMP_EXIT:%.*]] ]
|
||||
; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
||||
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_05]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
|
||||
; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP0]], 0.000000e+00
|
||||
; CHECK-NEXT: br i1 [[CMP_I]], label [[CLAMP_EXIT]], label [[IF_END_I:%.*]]
|
||||
; CHECK: if.end.i:
|
||||
; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP0]], 6.000000e+00
|
||||
; CHECK-NEXT: br i1 [[CMP1_I]], label [[CLAMP_EXIT]], label [[IF_END3_I:%.*]]
|
||||
; CHECK: if.end3.i:
|
||||
; CHECK-NEXT: br label [[CLAMP_EXIT]]
|
||||
; CHECK: clamp.exit:
|
||||
; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[TMP0]], [[IF_END3_I]] ], [ 0.000000e+00, [[FOR_BODY]] ], [ 6.000000e+00, [[IF_END_I]] ]
|
||||
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8
|
||||
; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP18]], 0.000000e+00
|
||||
; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP18]], 6.000000e+00
|
||||
; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP18]]
|
||||
; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]]
|
||||
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: store double [[RETVAL_0_I]], double* [[ARRAYIDX2]], align 8
|
||||
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_05]], 1
|
||||
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_05]], 19999
|
||||
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
|
||||
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP7:![0-9]+]]
|
||||
;
|
||||
entry:
|
||||
%X.addr = alloca double*, align 8
|
||||
|
|
|
@ -110,8 +110,8 @@ return:
|
|||
}
|
||||
|
||||
; A example where only the branch instructions from %if.then2 and %if.end3 need
|
||||
; to be hoisted, which effectively replaces the original branch in %if.end. As
|
||||
; this does not add any new instructions to the hoist location, it should always be profitable.
|
||||
; to be hoisted, which effectively replaces the original branch in %if.end and
|
||||
; only requires selects for PHIs in the successor.
|
||||
define float @clamp_float_value(float %value, float %minimum_value, float %maximum_value) {
|
||||
; HOIST-LABEL: @clamp_float_value(
|
||||
; HOIST-NEXT: entry:
|
||||
|
@ -124,14 +124,9 @@ define float @clamp_float_value(float %value, float %minimum_value, float %maxim
|
|||
; NOHOIST-LABEL: @clamp_float_value(
|
||||
; NOHOIST-NEXT: entry:
|
||||
; NOHOIST-NEXT: [[CMP:%.*]] = fcmp ogt float [[VALUE:%.*]], [[MAXIMUM_VALUE:%.*]]
|
||||
; NOHOIST-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
|
||||
; NOHOIST: if.end:
|
||||
; NOHOIST-NEXT: [[CMP1:%.*]] = fcmp olt float [[VALUE]], [[MINIMUM_VALUE:%.*]]
|
||||
; NOHOIST-NEXT: br i1 [[CMP1]], label [[RETURN]], label [[IF_END3:%.*]]
|
||||
; NOHOIST: if.end3:
|
||||
; NOHOIST-NEXT: br label [[RETURN]]
|
||||
; NOHOIST: return:
|
||||
; NOHOIST-NEXT: [[RETVAL_0:%.*]] = phi float [ [[VALUE]], [[IF_END3]] ], [ [[MAXIMUM_VALUE]], [[ENTRY:%.*]] ], [ [[MINIMUM_VALUE]], [[IF_END]] ]
|
||||
; NOHOIST-NEXT: [[MINIMUM_VALUE_VALUE:%.*]] = select i1 [[CMP1]], float [[MINIMUM_VALUE]], float [[VALUE]]
|
||||
; NOHOIST-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP]], float [[MAXIMUM_VALUE]], float [[MINIMUM_VALUE_VALUE]]
|
||||
; NOHOIST-NEXT: ret float [[RETVAL_0]]
|
||||
;
|
||||
entry:
|
||||
|
|
Loading…
Reference in New Issue