Add option to assume single-loop scops with sufficient compute are profitable

If a loop has a sufficiently large amount of compute instruction in its loop body, it is unlikely that our rewrite of the loop iterators introduces large performance changes. As Polly can also apply beneficical optimizations (such as parallelization) to such loop nests, we mark them as profitable. This option is currently "disabled" by default, but can be used to run experiments. If enabled by setting it e.g. to 40 instructions, we currently see some compile-time increases on LNT without any significant run-time changes. llvm-svn: 256199
2015-12-21 21:00:43 +00:00 · 2015-12-21 21:00:43 +00:00 · c1a269bf0e
parent 13da1f149e
commit c1a269bf0e
3 changed files with 135 additions and 4 deletions
--- a/polly/include/polly/ScopDetection.h
+++ b/polly/include/polly/ScopDetection.h
@ -287,6 +287,20 @@ private:
  /// @return True if all blocks in R are valid, false otherwise.
  bool allBlocksValid(DetectionContext &Context) const;

+  /// @brief Check if a region has sufficient compute instructions
+  ///
+  /// This function checks if a region has a non-trivial number of instructions
+  /// in each loop. This can be used as an indicator if a loop is worth
+  /// optimising.
+  ///
+  /// @param Context  The context of scop detection.
+  /// @param NumLoops The number of loops in the region.
+  ///
+  /// @return True if region is has sufficient compute instructions,
+  ///         false otherwise.
+  bool hasSufficientCompute(DetectionContext &Context,
+                            int NumAffineLoops) const;
+
  /// @brief Check if a region is profitable to optimize.
  ///
  /// Regions that are unlikely to expose interesting optimization opportunities
--- a/polly/lib/Analysis/ScopDetection.cpp
+++ b/polly/lib/Analysis/ScopDetection.cpp
@ -71,6 +71,16 @@ using namespace polly;

 #define DEBUG_TYPE "polly-detect"

+// This option is set to a very high value, as analyzing such loops increases
+// compile time on several cases. For experiments that enable this option,
+// a value of around 40 has been working to avoid run-time regressions with
+// Polly while still exposing interesting optimization opportunities.
+static cl::opt<int> ProfitabilityMinPerLoopInstructions(
+    "polly-detect-profitability-min-per-loop-insts",
+    cl::desc("The minimal number of per-loop instructions before a single loop "
+             "region is considered profitable"),
+    cl::Hidden, cl::ValueRequired, cl::init(100000000), cl::cat(PollyCategory));
+
 bool polly::PollyProcessUnprofitable;
 static cl::opt<bool, true> XPollyProcessUnprofitable(
    "polly-process-unprofitable",
@ -1134,6 +1144,19 @@ bool ScopDetection::allBlocksValid(DetectionContext &Context) const {
  return true;
 }

+bool ScopDetection::hasSufficientCompute(DetectionContext &Context,
+                                         int NumLoops) const {
+  int InstCount = 0;
+
+  for (auto *BB : Context.CurRegion.blocks())
+    if (Context.CurRegion.contains(LI->getLoopFor(BB)))
+      InstCount += std::distance(BB->begin(), BB->end());
+
+  InstCount = InstCount / NumLoops;
+
+  return InstCount >= ProfitabilityMinPerLoopInstructions;
+}
+
 bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
  Region &CurRegion = Context.CurRegion;

@ -1145,13 +1168,24 @@ bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
  if (!Context.hasStores || !Context.hasLoads)
    return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);

-  // Check if there are sufficent non-overapproximated loops.
  int NumLoops = countBeneficialLoops(&CurRegion);
  int NumAffineLoops = NumLoops - Context.BoxedLoopsSet.size();
-  if (NumAffineLoops < 2)
-    return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);

+  // Scops with at least two loops may allow either loop fusion or tiling and
+  // are consequently interesting to look at.
+  if (NumAffineLoops >= 2)
    return true;
+
+  // Scops that contain a loop with a non-trivial amount of computation per
+  // loop-iteration are interesting as we may be able to parallelize such
+  // loops. Individual loops that have only a small amount of computation
+  // per-iteration are performance-wise very fragile as any change to the
+  // loop induction variables may affect performance. To not cause spurious
+  // performance regressions, we do not consider such loops.
+  if (NumAffineLoops == 1 && hasSufficientCompute(Context, NumLoops))
+    return true;
+
+  return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 }

 bool ScopDetection::isValidRegion(DetectionContext &Context) const {
--- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@ -0,0 +1,83 @@
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN:                -polly-detect-profitability-min-per-loop-insts=40 \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=true \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN: \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=UNPROFITABLE
+
+; UNPROFITABLE-NOT: Valid Region for Scop:
+; PROFITABLE: Valid Region for Scop:
+
+;    void foo(float *A, float *B, long N) {
+;      for (long i = 0; i < 100; i++)
+;          A[i] += .... / * a  lot of compute */
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B, i64 %N) {
+entry:
+  br label %header
+
+header:
+  %i.0 = phi i64 [ 0, %entry ], [ %tmp10, %header ]
+  %tmp5 = sitofp i64 %i.0 to float
+  %tmp6 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp7 = load float, float* %tmp6, align 4
+  %tmp8 = fadd float %tmp7, %tmp5
+  %val0 = fadd float %tmp7, 1.0
+  %val1 = fadd float %val0, 1.0
+  %val2 = fadd float %val1, 1.0
+  %val3 = fadd float %val2, 1.0
+  %val4 = fadd float %val3, 1.0
+  %val5 = fadd float %val4, 1.0
+  %val6 = fadd float %val5, 1.0
+  %val7 = fadd float %val6, 1.0
+  %val8 = fadd float %val7, 1.0
+  %val9 = fadd float %val8, 1.0
+  %val10 = fadd float %val9, 1.0
+  %val11 = fadd float %val10, 1.0
+  %val12 = fadd float %val11, 1.0
+  %val13 = fadd float %val12, 1.0
+  %val14 = fadd float %val13, 1.0
+  %val15 = fadd float %val14, 1.0
+  %val16 = fadd float %val15, 1.0
+  %val17 = fadd float %val16, 1.0
+  %val18 = fadd float %val17, 1.0
+  %val19 = fadd float %val18, 1.0
+  %val20 = fadd float %val19, 1.0
+  %val21 = fadd float %val20, 1.0
+  %val22 = fadd float %val21, 1.0
+  %val23 = fadd float %val22, 1.0
+  %val24 = fadd float %val23, 1.0
+  %val25 = fadd float %val24, 1.0
+  %val26 = fadd float %val25, 1.0
+  %val27 = fadd float %val26, 1.0
+  %val28 = fadd float %val27, 1.0
+  %val29 = fadd float %val28, 1.0
+  %val30 = fadd float %val29, 1.0
+  %val31 = fadd float %val30, 1.0
+  %val32 = fadd float %val31, 1.0
+  %val33 = fadd float %val32, 1.0
+  %val34 = fadd float %val33, 1.0
+  %val35 = fadd float %val34, 1.0
+  %val36 = fadd float %val35, 1.0
+  %val37 = fadd float %val36, 1.0
+  %val38 = fadd float %val37, 1.0
+  %val39 = fadd float %val38, 1.0
+  %val40 = fadd float %val39, 1.0
+  %val41 = fadd float %val40, 1.0
+  %val42 = fadd float %val41, 1.0
+  %val43 = fadd float %val42, 1.0
+  store float %val34, float* %tmp6, align 4
+  %exitcond = icmp ne i64 %i.0, 100
+  %tmp10 = add nsw i64 %i.0, 1
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}