forked from OSchip/llvm-project
[Unroll] Do NOT unroll a loop with small runtime upperbound
For a runtime loop if we can compute its trip count upperbound: Don't unroll if: 1. loop is not guaranteed to run either zero or upperbound iterations; and 2. trip count upperbound is less than UnrollMaxUpperBound Unless user or TTI asked to do so. If unrolling, limit unroll factor to loop's trip count upperbound. Differential Revision: https://reviews.llvm.org/D62989 Change-Id: I6083c46a9d98b2e22cd855e60523fdc5a4929c73 llvm-svn: 373017
This commit is contained in:
parent
7dfb095b88
commit
1128fa0924
|
@ -114,8 +114,8 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
|
|||
DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
|
||||
const SmallPtrSetImpl<const Value *> &EphValues,
|
||||
OptimizationRemarkEmitter *ORE, unsigned &TripCount,
|
||||
unsigned MaxTripCount, unsigned &TripMultiple,
|
||||
unsigned LoopSize,
|
||||
unsigned MaxTripCount, bool MaxOrZero,
|
||||
unsigned &TripMultiple, unsigned LoopSize,
|
||||
TargetTransformInfo::UnrollingPreferences &UP,
|
||||
bool &UseUpperBound);
|
||||
|
||||
|
|
|
@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount(
|
|||
bool UseUpperBound = false;
|
||||
bool ExplicitUnroll = computeUnrollCount(
|
||||
L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
|
||||
OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
|
||||
/*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
|
||||
if (ExplicitUnroll || UseUpperBound) {
|
||||
// If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
|
||||
// for the unroller instead.
|
||||
|
|
|
@ -737,7 +737,7 @@ bool llvm::computeUnrollCount(
|
|||
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
|
||||
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
|
||||
OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
|
||||
unsigned &TripMultiple, unsigned LoopSize,
|
||||
bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
|
||||
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
|
||||
|
||||
// Check for explicit Count.
|
||||
|
@ -788,18 +788,34 @@ bool llvm::computeUnrollCount(
|
|||
// Also we need to check if we exceed FullUnrollMaxCount.
|
||||
// If using the upper bound to unroll, TripMultiple should be set to 1 because
|
||||
// we do not know when loop may exit.
|
||||
// MaxTripCount and ExactTripCount cannot both be non zero since we only
|
||||
|
||||
// We can unroll by the upper bound amount if it's generally allowed or if
|
||||
// we know that the loop is executed either the upper bound or zero times.
|
||||
// (MaxOrZero unrolling keeps only the first loop test, so the number of
|
||||
// loop tests remains the same compared to the non-unrolled version, whereas
|
||||
// the generic upper bound unrolling keeps all but the last loop test so the
|
||||
// number of loop tests goes up which may end up being worse on targets with
|
||||
// constrained branch predictor resources so is controlled by an option.)
|
||||
// In addition we only unroll small upper bounds.
|
||||
unsigned FullUnrollMaxTripCount = MaxTripCount;
|
||||
if (!(UP.UpperBound || MaxOrZero) ||
|
||||
FullUnrollMaxTripCount > UnrollMaxUpperBound)
|
||||
FullUnrollMaxTripCount = 0;
|
||||
|
||||
// UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
|
||||
// compute the former when the latter is zero.
|
||||
unsigned ExactTripCount = TripCount;
|
||||
assert((ExactTripCount == 0 || MaxTripCount == 0) &&
|
||||
"ExtractTripCount and MaxTripCount cannot both be non zero.");
|
||||
unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
|
||||
assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
|
||||
"ExtractTripCount and UnrollByMaxCount cannot both be non zero.");
|
||||
|
||||
unsigned FullUnrollTripCount =
|
||||
ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
|
||||
UP.Count = FullUnrollTripCount;
|
||||
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
|
||||
// When computing the unrolled size, note that BEInsns are not replicated
|
||||
// like the rest of the loop body.
|
||||
if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
|
||||
UseUpperBound = (MaxTripCount == FullUnrollTripCount);
|
||||
UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
|
||||
TripCount = FullUnrollTripCount;
|
||||
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
|
||||
return ExplicitUnroll;
|
||||
|
@ -813,7 +829,7 @@ bool llvm::computeUnrollCount(
|
|||
unsigned Boost =
|
||||
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
|
||||
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
|
||||
UseUpperBound = (MaxTripCount == FullUnrollTripCount);
|
||||
UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
|
||||
TripCount = FullUnrollTripCount;
|
||||
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
|
||||
return ExplicitUnroll;
|
||||
|
@ -889,6 +905,8 @@ bool llvm::computeUnrollCount(
|
|||
"because "
|
||||
"unrolled size is too large.";
|
||||
});
|
||||
LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
|
||||
<< "\n");
|
||||
return ExplicitUnroll;
|
||||
}
|
||||
assert(TripCount == 0 &&
|
||||
|
@ -910,6 +928,12 @@ bool llvm::computeUnrollCount(
|
|||
return false;
|
||||
}
|
||||
|
||||
// Don't unroll a small upper bound loop unless user or TTI asked to do so.
|
||||
if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
|
||||
UP.Count = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the runtime trip count is too small when profile is available.
|
||||
if (L->getHeader()->getParent()->hasProfileData()) {
|
||||
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
|
||||
|
@ -973,7 +997,11 @@ bool llvm::computeUnrollCount(
|
|||
|
||||
if (UP.Count > UP.MaxCount)
|
||||
UP.Count = UP.MaxCount;
|
||||
LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
|
||||
|
||||
if (MaxTripCount && UP.Count > MaxTripCount)
|
||||
UP.Count = MaxTripCount;
|
||||
|
||||
LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
|
||||
<< "\n");
|
||||
if (UP.Count < 2)
|
||||
UP.Count = 0;
|
||||
|
@ -1049,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop(
|
|||
|
||||
// Find trip count and trip multiple if count is not available
|
||||
unsigned TripCount = 0;
|
||||
unsigned MaxTripCount = 0;
|
||||
unsigned TripMultiple = 1;
|
||||
// If there are multiple exiting blocks but one of them is the latch, use the
|
||||
// latch for the trip count estimation. Otherwise insist on a single exiting
|
||||
|
@ -1079,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop(
|
|||
|
||||
// Try to find the trip count upper bound if we cannot find the exact trip
|
||||
// count.
|
||||
unsigned MaxTripCount = 0;
|
||||
bool MaxOrZero = false;
|
||||
if (!TripCount) {
|
||||
MaxTripCount = SE.getSmallConstantMaxTripCount(L);
|
||||
MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
|
||||
// We can unroll by the upper bound amount if it's generally allowed or if
|
||||
// we know that the loop is executed either the upper bound or zero times.
|
||||
// (MaxOrZero unrolling keeps only the first loop test, so the number of
|
||||
// loop tests remains the same compared to the non-unrolled version, whereas
|
||||
// the generic upper bound unrolling keeps all but the last loop test so the
|
||||
// number of loop tests goes up which may end up being worse on targets with
|
||||
// constrained branch predictor resources so is controlled by an option.)
|
||||
// In addition we only unroll small upper bounds.
|
||||
if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
|
||||
MaxTripCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// computeUnrollCount() decides whether it is beneficial to use upper bound to
|
||||
// fully unroll the loop.
|
||||
bool UseUpperBound = false;
|
||||
bool IsCountSetExplicitly = computeUnrollCount(
|
||||
L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
|
||||
L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
|
||||
TripMultiple, LoopSize, UP, UseUpperBound);
|
||||
if (!UP.Count)
|
||||
return LoopUnrollResult::Unmodified;
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
; RUN: opt -S -loop-unroll -unroll-runtime %s -o - | FileCheck %s
|
||||
; RUN: opt -S -loop-unroll -unroll-runtime -unroll-max-upperbound=6 %s -o - | FileCheck %s --check-prefix=UPPER
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
|
||||
|
||||
@global = dso_local local_unnamed_addr global i32 0, align 4
|
||||
@global.1 = dso_local local_unnamed_addr global i8* null, align 4
|
||||
|
||||
; Check that loop in hoge_3, with a runtime upperbound of 3, is not unrolled.
|
||||
; CHECK-LABEL: hoge_3
|
||||
; CHECK: loop:
|
||||
; CHECK: store
|
||||
; CHECK-NOT: store
|
||||
; CHECK: br i1 %{{.*}}, label %loop
|
||||
; UPPER-LABEL: hoge_3
|
||||
; UPPER: loop:
|
||||
; UPPER: store
|
||||
; UPPER-NOT: store
|
||||
; UPPER: br i1 %{{.*}}, label %loop
|
||||
define dso_local void @hoge_3(i8 %arg) {
|
||||
entry:
|
||||
%x = load i32, i32* @global, align 4
|
||||
%y = load i8*, i8** @global.1, align 4
|
||||
%0 = icmp ult i32 %x, 17
|
||||
br i1 %0, label %loop, label %exit
|
||||
|
||||
loop:
|
||||
%iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ]
|
||||
%ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ]
|
||||
%iv.next = add nuw i32 %iv, 8
|
||||
%ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1
|
||||
store i8 %arg, i8* %ptr.next, align 1
|
||||
%1 = icmp ult i32 %iv.next, 17
|
||||
br i1 %1, label %loop, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Check that loop in hoge_5, with a runtime upperbound of 5, is unrolled when -unroll-max-upperbound=4
|
||||
; CHECK-LABEL: hoge_5
|
||||
; CHECK: loop:
|
||||
; CHECK: store
|
||||
; CHECK-NOT: store
|
||||
; CHECK: br i1 %{{.*}}, label %loop
|
||||
; UPPER-LABEL: hoge_5
|
||||
; UPPER: loop:
|
||||
; UPPER: store
|
||||
; UPPER: store
|
||||
; UPPER: store
|
||||
; UPPER: br i1 %{{.*}}, label %loop
|
||||
define dso_local void @hoge_5(i8 %arg) {
|
||||
entry:
|
||||
%x = load i32, i32* @global, align 4
|
||||
%y = load i8*, i8** @global.1, align 4
|
||||
%0 = icmp ult i32 %x, 17
|
||||
br i1 %0, label %loop, label %exit
|
||||
|
||||
loop:
|
||||
%iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ]
|
||||
%ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ]
|
||||
%iv.next = add nuw i32 %iv, 4
|
||||
%ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1
|
||||
store i8 %arg, i8* %ptr.next, align 1
|
||||
%1 = icmp ult i32 %iv.next, 17
|
||||
br i1 %1, label %loop, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue