[SimplifyCFG] Allow hoisting terminators only with HoistCommonInsts=false.

As a side-effect of the change to default HoistCommonInsts to false
early in the pipeline, we fail to convert conditional branch & phis to
selects early on, which prevents vectorization for loops that contain
conditional branches that effectively are selects (or if the loop gets
vectorized, it will get vectorized very inefficiently).

This patch updates SimplifyCFG to perform hoisting if the only
instruction in both BBs is an equal branch. In this case, the only
additional instructions are selects for phis, which should be cheap.

Even though we perform hoisting, the benefits of this kind of hoisting
should by far outweigh the negatives.

For example, the loop in the code below will not get vectorized on
AArch64 with the current default, but will with the patch. This is a
fundamental pattern we should definitely vectorize. Besides that, I
think the select variants should be easier to use for reasoning across
other passes as well.

https://clang.godbolt.org/z/sbjd8Wshx

```
double clamp(double v) {
  if (v < 0.0)
    return 0.0;
  if (v > 6.0)
    return 6.0;
  return v;
}

void loop(double* X, double *Y) {
  for (unsigned i = 0; i < 20000; i++) {
    X[i] = clamp(Y[i]);
  }
}
```

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D100329
This commit is contained in:
Florian Hahn 2021-04-13 08:24:24 +01:00
parent b757bc14e6
commit 467b1f1cd2
No known key found for this signature in database
GPG Key ID: 61D7554B5CECDC0D
3 changed files with 65 additions and 30 deletions

View File

@ -253,7 +253,8 @@ class SimplifyCFGOpt {
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI,
bool EqTermsOnly);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
const TargetTransformInfo &TTI);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
@ -1388,9 +1389,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
/// in the two blocks up into the branch block. The caller of this function
/// guarantees that BI's block dominates BB1 and BB2.
/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given,
/// only perform hoisting in case both blocks only contain a terminator. In that
/// case, only the original BI will be replaced and selects for PHIs are added.
bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
const TargetTransformInfo &TTI) {
const TargetTransformInfo &TTI,
bool EqTermsOnly) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
// O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
@ -1427,6 +1431,16 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
++NumHoistCommonCode;
});
// Check if only hoisting terminators is allowed. This does not add new
// instructions to the hoist location.
if (EqTermsOnly) {
if (!I1->isIdenticalToWhenDefined(I2))
return false;
if (!I1->isTerminator())
return false;
goto HoistTerminator;
}
do {
// If we are hoisting the terminator instruction, don't move one (making a
// broken BB), instead clone it, and remove BI.
@ -6443,9 +6457,9 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && Options.HoistCommonInsts)
if (HoistThenElseCodeToIf(BI, TTI))
return requestResimplify();
if (HoistCommon &&
HoistThenElseCodeToIf(BI, TTI, !Options.HoistCommonInsts))
return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.

View File

@ -41,28 +41,54 @@ return: ; preds = %if.end3, %if.then2,
define void @loop(double* %X, double* %Y) {
; CHECK-LABEL: @loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 20000
; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr double, double* [[Y:%.*]], i64 20000
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP9]], [[X]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2
; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD11]], <double 6.000000e+00, double 6.000000e+00>
; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP8]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD11]]
; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP6]], <2 x double> zeroinitializer, <2 x double> [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 2
; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[CLAMP_EXIT:%.*]] ]
; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_05]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP0]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP_I]], label [[CLAMP_EXIT]], label [[IF_END_I:%.*]]
; CHECK: if.end.i:
; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP0]], 6.000000e+00
; CHECK-NEXT: br i1 [[CMP1_I]], label [[CLAMP_EXIT]], label [[IF_END3_I:%.*]]
; CHECK: if.end3.i:
; CHECK-NEXT: br label [[CLAMP_EXIT]]
; CHECK: clamp.exit:
; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[TMP0]], [[IF_END3_I]] ], [ 0.000000e+00, [[FOR_BODY]] ], [ 6.000000e+00, [[IF_END_I]] ]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IDXPROM]]
; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP18]], 0.000000e+00
; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP18]], 6.000000e+00
; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP18]]
; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IDXPROM]]
; CHECK-NEXT: store double [[RETVAL_0_I]], double* [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_05]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_05]], 19999
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
%X.addr = alloca double*, align 8

View File

@ -110,8 +110,8 @@ return:
}
; A example where only the branch instructions from %if.then2 and %if.end3 need
; to be hoisted, which effectively replaces the original branch in %if.end. As
; this does not add any new instructions to the hoist location, it should always be profitable.
; to be hoisted, which effectively replaces the original branch in %if.end and
; only requires selects for PHIs in the successor.
define float @clamp_float_value(float %value, float %minimum_value, float %maximum_value) {
; HOIST-LABEL: @clamp_float_value(
; HOIST-NEXT: entry:
@ -124,14 +124,9 @@ define float @clamp_float_value(float %value, float %minimum_value, float %maxim
; NOHOIST-LABEL: @clamp_float_value(
; NOHOIST-NEXT: entry:
; NOHOIST-NEXT: [[CMP:%.*]] = fcmp ogt float [[VALUE:%.*]], [[MAXIMUM_VALUE:%.*]]
; NOHOIST-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
; NOHOIST: if.end:
; NOHOIST-NEXT: [[CMP1:%.*]] = fcmp olt float [[VALUE]], [[MINIMUM_VALUE:%.*]]
; NOHOIST-NEXT: br i1 [[CMP1]], label [[RETURN]], label [[IF_END3:%.*]]
; NOHOIST: if.end3:
; NOHOIST-NEXT: br label [[RETURN]]
; NOHOIST: return:
; NOHOIST-NEXT: [[RETVAL_0:%.*]] = phi float [ [[VALUE]], [[IF_END3]] ], [ [[MAXIMUM_VALUE]], [[ENTRY:%.*]] ], [ [[MINIMUM_VALUE]], [[IF_END]] ]
; NOHOIST-NEXT: [[MINIMUM_VALUE_VALUE:%.*]] = select i1 [[CMP1]], float [[MINIMUM_VALUE]], float [[VALUE]]
; NOHOIST-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP]], float [[MAXIMUM_VALUE]], float [[MINIMUM_VALUE_VALUE]]
; NOHOIST-NEXT: ret float [[RETVAL_0]]
;
entry: