Add option to assume single-loop scops with sufficient compute are profitable

If a loop has a sufficiently large number of compute instructions in its loop
body, it is unlikely that our rewrite of the loop iterators introduces large
performance changes. As Polly can also apply beneficial optimizations (such
as parallelization) to such loop nests, we mark them as profitable.

This option is currently "disabled" by default, but can be used to run
experiments. If enabled by setting it e.g. to 40 instructions, we currently
see some compile-time increases on LNT without any significant run-time
changes.

llvm-svn: 256199
This commit is contained in:
Tobias Grosser 2015-12-21 21:00:43 +00:00
parent 13da1f149e
commit c1a269bf0e
3 changed files with 135 additions and 4 deletions

View File

@ -287,6 +287,20 @@ private:
/// @return True if all blocks in R are valid, false otherwise.
bool allBlocksValid(DetectionContext &Context) const;
/// @brief Check if a region has sufficient compute instructions
///
/// This function checks if a region has a non-trivial number of instructions
/// in each loop. This can be used as an indicator if a loop is worth
/// optimising.
///
/// The instruction count is averaged over @p NumLoops and compared against
/// the -polly-detect-profitability-min-per-loop-insts threshold.
///
/// @param Context  The context of scop detection.
/// @param NumLoops The number of loops in the region.
///
/// @return True if the region has sufficient compute instructions,
///         false otherwise.
bool hasSufficientCompute(DetectionContext &Context,
                          int NumLoops) const;
/// @brief Check if a region is profitable to optimize.
///
/// Regions that are unlikely to expose interesting optimization opportunities

View File

@ -71,6 +71,16 @@ using namespace polly;
#define DEBUG_TYPE "polly-detect"
// Threshold on the average number of instructions per loop that
// hasSufficientCompute() requires before a region with a single affine loop
// is treated as profitable.
//
// This option is set to a very high value by default, as analyzing such loops
// increases compile time in several cases; the large default effectively
// keeps the heuristic switched off. For experiments that enable this option,
// a value of around 40 has been working to avoid run-time regressions with
// Polly while still exposing interesting optimization opportunities.
static cl::opt<int> ProfitabilityMinPerLoopInstructions(
"polly-detect-profitability-min-per-loop-insts",
cl::desc("The minimal number of per-loop instructions before a single loop "
"region is considered profitable"),
cl::Hidden, cl::ValueRequired, cl::init(100000000), cl::cat(PollyCategory));
bool polly::PollyProcessUnprofitable;
static cl::opt<bool, true> XPollyProcessUnprofitable(
"polly-process-unprofitable",
@ -1134,6 +1144,19 @@ bool ScopDetection::allBlocksValid(DetectionContext &Context) const {
return true;
}
// Decide whether the loops of the current region contain enough instructions
// to be worth optimizing.
//
// Counts the instructions of every block in the region whose loop (as
// reported by LoopInfo) is contained in the region, averages that count over
// NumLoops, and compares the result with
// -polly-detect-profitability-min-per-loop-insts.
//
// Context  - detection context providing the region under consideration.
// NumLoops - number of loops the instruction count is averaged over.
//
// Returns true if the per-loop instruction count reaches the threshold.
bool ScopDetection::hasSufficientCompute(DetectionContext &Context,
                                         int NumLoops) const {
  // Without any loop there is nothing to average over; bail out instead of
  // dividing by zero below.
  if (NumLoops <= 0)
    return false;

  int InstCount = 0;

  for (auto *BB : Context.CurRegion.blocks())
    if (Context.CurRegion.contains(LI->getLoopFor(BB)))
      InstCount += std::distance(BB->begin(), BB->end());

  // Integer division: the average is rounded down before the comparison.
  InstCount = InstCount / NumLoops;

  return InstCount >= ProfitabilityMinPerLoopInstructions;
}
bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
Region &CurRegion = Context.CurRegion;
@ -1145,13 +1168,24 @@ bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
if (!Context.hasStores || !Context.hasLoads)
return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
// Check if there are sufficient non-overapproximated loops.
int NumLoops = countBeneficialLoops(&CurRegion);
int NumAffineLoops = NumLoops - Context.BoxedLoopsSet.size();
if (NumAffineLoops < 2)
return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
return true;
// Scops with at least two loops may allow either loop fusion or tiling and
// are consequently interesting to look at.
if (NumAffineLoops >= 2)
return true;
// Scops that contain a loop with a non-trivial amount of computation per
// loop-iteration are interesting as we may be able to parallelize such
// loops. Individual loops that have only a small amount of computation
// per-iteration are performance-wise very fragile as any change to the
// loop induction variables may affect performance. To not cause spurious
// performance regressions, we do not consider such loops.
if (NumAffineLoops == 1 && hasSufficientCompute(Context, NumLoops))
return true;
return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
}
bool ScopDetection::isValidRegion(DetectionContext &Context) const {

View File

@ -0,0 +1,83 @@
; RUN: opt %loadPolly -polly-process-unprofitable=false \
; RUN: -polly-detect-profitability-min-per-loop-insts=40 \
; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
; RUN: opt %loadPolly -polly-process-unprofitable=true \
; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
; RUN: opt %loadPolly -polly-process-unprofitable=false \
; RUN: \
; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=UNPROFITABLE
; UNPROFITABLE-NOT: Valid Region for Scop:
; PROFITABLE: Valid Region for Scop:
; void foo(float *A, float *B, long N) {
; for (long i = 0; i < 100; i++)
; A[i] += .... /* a lot of compute */
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Single-loop kernel used to exercise the per-loop instruction threshold.
; The whole loop (body and loop control) lives in the %header block, which
; contains 53 instructions; that is above the value of 40 passed via
; -polly-detect-profitability-min-per-loop-insts at the top of this file.
; The arguments %B and %N are not referenced in the body.
define void @foo(float* %A, float* %B, i64 %N) {
entry:
br label %header
; Loop block: branches back to itself until the exit condition fails.
header:
; Induction variable, starting at 0 and advanced by %tmp10 below.
%i.0 = phi i64 [ 0, %entry ], [ %tmp10, %header ]
%tmp5 = sitofp i64 %i.0 to float
%tmp6 = getelementptr inbounds float, float* %A, i64 %i.0
; Load A[i]; the same address is written by the store further down.
%tmp7 = load float, float* %tmp6, align 4
; %tmp8 has no users in this function.
%tmp8 = fadd float %tmp7, %tmp5
; Chain of 44 dependent fadd instructions (%val0 .. %val43) that provides the
; bulk of the instruction count for this block.
%val0 = fadd float %tmp7, 1.0
%val1 = fadd float %val0, 1.0
%val2 = fadd float %val1, 1.0
%val3 = fadd float %val2, 1.0
%val4 = fadd float %val3, 1.0
%val5 = fadd float %val4, 1.0
%val6 = fadd float %val5, 1.0
%val7 = fadd float %val6, 1.0
%val8 = fadd float %val7, 1.0
%val9 = fadd float %val8, 1.0
%val10 = fadd float %val9, 1.0
%val11 = fadd float %val10, 1.0
%val12 = fadd float %val11, 1.0
%val13 = fadd float %val12, 1.0
%val14 = fadd float %val13, 1.0
%val15 = fadd float %val14, 1.0
%val16 = fadd float %val15, 1.0
%val17 = fadd float %val16, 1.0
%val18 = fadd float %val17, 1.0
%val19 = fadd float %val18, 1.0
%val20 = fadd float %val19, 1.0
%val21 = fadd float %val20, 1.0
%val22 = fadd float %val21, 1.0
%val23 = fadd float %val22, 1.0
%val24 = fadd float %val23, 1.0
%val25 = fadd float %val24, 1.0
%val26 = fadd float %val25, 1.0
%val27 = fadd float %val26, 1.0
%val28 = fadd float %val27, 1.0
%val29 = fadd float %val28, 1.0
%val30 = fadd float %val29, 1.0
%val31 = fadd float %val30, 1.0
%val32 = fadd float %val31, 1.0
%val33 = fadd float %val32, 1.0
%val34 = fadd float %val33, 1.0
%val35 = fadd float %val34, 1.0
%val36 = fadd float %val35, 1.0
%val37 = fadd float %val36, 1.0
%val38 = fadd float %val37, 1.0
%val39 = fadd float %val38, 1.0
%val40 = fadd float %val39, 1.0
%val41 = fadd float %val40, 1.0
%val42 = fadd float %val41, 1.0
%val43 = fadd float %val42, 1.0
; Only %val34 is written back; %val35 .. %val43 have no users.
; NOTE(review): presumably the unused tail is just padding for the
; instruction count -- confirm before changing the stored value.
store float %val34, float* %tmp6, align 4
; The loop continues while %i.0 != 100.
; NOTE(review): the C sketch above uses `i < 100`; the bounds differ by one
; iteration -- confirm whether that matters for this test.
%exitcond = icmp ne i64 %i.0, 100
%tmp10 = add nsw i64 %i.0, 1
br i1 %exitcond, label %header, label %exit
exit:
ret void
}