[Passes] Run peeling as part of simple/full loop unrolling.

Loop peeling removes conditions from loop bodies that become invariant
after a small number of iterations. When triggered, this leads to fewer
compares and possibly PHIs in loop bodies, enabling further optimizations.
The current cost-model of loop peeling should be quite conservative/safe,
i.e. only peel if a condition in the loop becomes known after peeling.

For example, see PR47671, where loop peeling enables vectorization by
removing a PHI the vectorizer does not understand. Granted, the
loop-vectorizer could also be taught about constant PHIs, but loop peeling
is likely to enable other optimizations as well.

This has an impact on quite a few benchmarks from
MultiSource/SPEC2000/SPEC2006 on X86 with -O3 -flto, for example:

Same hash: 186 (filtered out)
Remaining: 51
Metric: loop-vectorize.LoopsVectorized

Program                                        base    patch   diff
 test-suite...ve-susan/automotive-susan.test     8.00    9.00  12.5%
 test-suite...nal/skidmarks10/skidmarks.test    35.00   31.00  -11.4%
 test-suite...lications/sqlite3/sqlite3.test    41.00   43.00   4.9%
 test-suite...s/ASC_Sequoia/AMGmk/AMGmk.test    25.00   26.00   4.0%
 test-suite...006/450.soplex/450.soplex.test    88.00   89.00   1.1%
 test-suite...TimberWolfMC/timberwolfmc.test   120.00  119.00  -0.8%
 test-suite.../CINT2006/403.gcc/403.gcc.test   215.00  216.00   0.5%
 test-suite...006/447.dealII/447.dealII.test   957.00  958.00   0.1%
 test-suite...ternal/HMMER/hmmcalibrate.test    75.00   75.00   0.0%

Same hash: 186 (filtered out)
Remaining: 51
Metric: loop-vectorize.LoopsAnalyzed

Program                                        base     patch    diff
 test-suite...ks/Prolangs-C/agrep/agrep.test    440.00   434.00  -1.4%
 test-suite...nal/skidmarks10/skidmarks.test    312.00   308.00  -1.3%
 test-suite...marks/7zip/7zip-benchmark.test   6399.00  6323.00  -1.2%
 test-suite...lications/minisat/minisat.test    134.00   135.00   0.7%
 test-suite...rks/FreeBench/pifft/pifft.test    295.00   297.00   0.7%
 test-suite...TimberWolfMC/timberwolfmc.test   1879.00  1869.00  -0.5%
 test-suite...pplications/treecc/treecc.test    689.00   691.00   0.3%
 test-suite...T2000/300.twolf/300.twolf.test   1593.00  1597.00   0.3%
 test-suite.../Benchmarks/Bullet/bullet.test   1394.00  1392.00  -0.1%
 test-suite...ications/JM/ldecod/ldecod.test   1431.00  1429.00  -0.1%
 test-suite...6/464.h264ref/464.h264ref.test   2229.00  2230.00   0.0%
 test-suite...lications/sqlite3/sqlite3.test   2590.00  2589.00  -0.0%
 test-suite...ications/JM/lencod/lencod.test   2732.00  2733.00   0.0%
 test-suite...006/453.povray/453.povray.test   3395.00  3394.00  -0.0%

Note the -11% regression in number of loops vectorized for skidmarks. I
suspect this corresponds to the fact that those loops are gone now (see the
reduction in number of loops analyzed by LV).

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D88471
2021-01-26 13:43:39 +00:00 · 2021-01-26 13:43:39 +00:00 · 35b3989a30
parent 4dc110a4b8
commit 35b3989a30
5 changed files with 10 additions and 7 deletions
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@ -190,7 +190,8 @@ Pass *createLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false,
                           int Count = -1, int AllowPartial = -1,
                           int Runtime = -1, int UpperBound = -1,
                           int AllowPeeling = -1);
-// Create an unrolling pass for full unrolling that uses exact trip count only.
+// Create an unrolling pass for full unrolling that uses exact trip count only
+// and also does peeling.
 Pass *createSimpleLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false,
                                 bool ForgetAllSCEV = false);

--- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@ -22,7 +22,7 @@ class Function;
 class Loop;
 class LPMUpdater;

-/// Loop unroll pass that only does full loop unrolling.
+/// Loop unroll pass that only does full loop unrolling and peeling.
 class LoopFullUnrollPass : public PassInfoMixin<LoopFullUnrollPass> {
  const int OptLevel;

--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -458,7 +458,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
  if (EnableLoopInterchange)
    MPM.add(createLoopInterchangePass()); // Interchange loops

-  // Unroll small loops
+  // Unroll small loops and perform peeling.
  MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                     ForgetAllSCEVInLoopUnroll));
  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
@ -1072,7 +1072,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
  if (EnableConstraintElimination)
    PM.add(createConstraintEliminationPass());

-  // Unroll small loops
+  // Unroll small loops and perform peeling.
  PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                    ForgetAllSCEVInLoopUnroll));
  PM.add(createLoopDistributePass());
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@ -1301,7 +1301,7 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
 Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
                                       bool ForgetAllSCEV) {
  return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
-                              0, 0, 0, 0);
+                              0, 0, 0, 1);
 }

 PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
@ -1329,7 +1329,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                 OnlyWhenForced, ForgetSCEV, /*Count*/ None,
                                 /*Threshold*/ None, /*AllowPartial*/ false,
                                 /*Runtime*/ false, /*UpperBound*/ false,
-                                 /*AllowPeeling*/ false,
+                                 /*AllowPeeling*/ true,
                                 /*AllowProfileBasedPeeling*/ false,
                                 /*FullUnrollMaxCount*/ None) !=
                 LoopUnrollResult::Unmodified;
--- a/llvm/test/Transforms/PhaseOrdering/X86/peel-before-lv-to-enable-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/peel-before-lv-to-enable-vectorization.ll
@ -11,7 +11,9 @@ target triple = "x86_64-apple-macosx"

 define i32 @test(i32* readonly %p, i32* readnone %q) {
 ; CHECK-LABEL: define i32 @test(
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK:   %index.next = add i64 %index, 8
+; CHECK: middle.block:
 ;
 entry:
  %cmp.not7 = icmp eq i32* %p, %q