[LoopDist] Add llvm.loop.distribute.enable loop metadata

Summary: D19403 adds a new pragma for loop distribution. This change adds support for the corresponding metadata that the pragma is translated to by the FE. As part of this I had to rethink the flag -enable-loop-distribute. My goal was to be backward compatible with the existing behavior: A1. pass is off by default from the optimization pipeline unless -enable-loop-distribute is specified A2. pass is on when invoked directly from opt (e.g. for unit-testing) The new pragma/metadata overrides these defaults so the new behavior is: B1. A1 + enable distribution for individual loop with the pragma/metadata B2. A2 + disable distribution for individual loop with the pragma/metadata The default value whether the pass is on or off comes from the initiator of the pass. From the PassManagerBuilder the default is off, from opt it's on. I moved -enable-loop-distribute under the pass. If the flag is specified it overrides the default from above. Then the pragma/metadata can further modifies this per loop. As a side-effect, we can now also use -enable-loop-distribute=0 from opt to emulate the default from the optimization pipeline. So to be precise this is the new behavior: C1. pass is off by default from the optimization pipeline unless -enable-loop-distribute or the pragma/metadata enables it C2. pass is on when invoked directly from opt unless -enable-loop-distribute=0 or the pragma/metadata disables it Reviewers: hfinkel Subscribers: joker.eph, mzolotukhin, llvm-commits Differential Revision: http://reviews.llvm.org/D19431 llvm-svn: 267672
2016-04-27 05:28:18 +00:00 · 2016-04-27 05:28:18 +00:00 · d2fa414718
parent 08efb0efcd
commit d2fa414718
5 changed files with 247 additions and 13 deletions
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@ -4711,6 +4711,27 @@ which is the string ``llvm.loop.licm_versioning.disable``. For example:

   !0 = !{!"llvm.loop.licm_versioning.disable"}

+'``llvm.loop.distribute.enable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Loop distribution allows splitting a loop into multiple loops.  Currently,
+this is only performed if the entire loop cannot be vectorized due to unsafe
+memory dependencies.  The transformation will atempt to isolate the unsafe
+dependencies into their own loop.
+
+This metadata can be used to selectively enable or disable distribution of the
+loop.  The first operand is the string ``llvm.loop.distribute.enable`` and the
+second operand is a bit. If the bit operand value is 1 distribution is
+enabled. A value of 0 disables distribution:
+
+.. code-block:: llvm
+
+   !0 = !{!"llvm.loop.distribute.enable", i1 0}
+   !1 = !{!"llvm.loop.distribute.enable", i1 1}
+
+This metadata should be used in conjunction with ``llvm.loop`` loop
+identification metadata.
+
 '``llvm.mem``'
 ^^^^^^^^^^^^^^^

--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@ -479,7 +479,10 @@ FunctionPass *createNaryReassociatePass();
 //
 // LoopDistribute - Distribute loops.
 //
-FunctionPass *createLoopDistributePass();
+// ProcessAllLoopsByDefault instructs the pass to look for distribution
+// opportunities in all loops unless -enable-loop-distribute or the
+// llvm.loop.distribute.enable metadata data override this default.
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault);

 //===----------------------------------------------------------------------===//
 //
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -95,10 +95,6 @@ static cl::opt<bool> EnableLoopInterchange(
    "enable-loopinterchange", cl::init(false), cl::Hidden,
    cl::desc("Enable the new, experimental LoopInterchange Pass"));

-static cl::opt<bool> EnableLoopDistribute(
-    "enable-loop-distribute", cl::init(false), cl::Hidden,
-    cl::desc("Enable the new, experimental LoopDistribution Pass"));
-
 static cl::opt<bool> EnableNonLTOGlobalsModRef(
    "enable-non-lto-gmr", cl::init(true), cl::Hidden,
    cl::desc(
@ -480,9 +476,10 @@ void PassManagerBuilder::populateModulePassManager(
  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));

  // Distribute loops to allow partial vectorization.  I.e. isolate dependences
-  // into separate loop that would otherwise inhibit vectorization.
-  if (EnableLoopDistribute)
-    MPM.add(createLoopDistributePass());
+  // into separate loop that would otherwise inhibit vectorization.  This is
+  // currently only performed for loops marked with the metadata
+  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+  MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false));

  MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));

--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@ -60,6 +60,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+    cl::Hidden,
+    cl::desc(
+        "The maximum number of SCEV checks allowed for Loop "
+        "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+// Note that the initial value for this depends on whether the pass is invoked
+// directly or from the optimization pipeline.
+static cl::opt<bool> EnableLoopDistribute(
+    "enable-loop-distribute", cl::Hidden,
+    cl::desc("Enable the new, experimental LoopDistribution Pass"));
+
 STATISTIC(NumLoopsDistributed, "Number of loops distributed");

 namespace {
@ -576,7 +589,9 @@ class LoopDistributeForLoop {
 public:
  LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
                        DominatorTree *DT, ScalarEvolution *SE)
-      : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {}
+      : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {
+    setForced();
+  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop() {
@ -683,7 +698,9 @@ public:

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
-    if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
+    if (Pred.getComplexity() > (IsForced.getValueOr(false)
+                                    ? PragmaDistributeSCEVCheckThreshold
+                                    : DistributeSCEVCheckThreshold)) {
      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
      return false;
    }
@ -735,6 +752,13 @@ public:
    return true;
  }

+  /// \brief Return if distribution forced to be enabled/disabled for the loop.
+  ///
+  /// If the optional has a value, it indicates whether distribution was forced
+  /// to be enabled (true) or disabled (false).  If the optional has no value
+  /// distribution was not forced either way.
+  const Optional<bool> &isForced() const { return IsForced; }
+
 private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
@ -775,18 +799,47 @@ private:
    return Checks;
  }

+  /// \brief Check whether the loop metadata is forcing distribution to be
+  /// enabled/disabled.
+  void setForced() {
+    Optional<const MDOperand *> Value =
+        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+    if (!Value)
+      return;
+
+    const MDOperand *Op = *Value;
+    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+  }
+
  // Analyses used.
  Loop *L;
  LoopInfo *LI;
  const LoopAccessInfo &LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;
+
+  /// \brief Indicates whether distribution is forced to be enabled/disabled for
+  /// the loop.
+  ///
+  /// If the optional has a value, it indicates whether distribution was forced
+  /// to be enabled (true) or disabled (false).  If the optional has no value
+  /// distribution was not forced either way.
+  Optional<bool> IsForced;
 };

 /// \brief The pass class.
 class LoopDistribute : public FunctionPass {
 public:
-  LoopDistribute() : FunctionPass(ID) {
+  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
+  /// performed by default.  Pass -enable-loop-distribute={0,1} overrides this
+  /// default.  We use this to keep LoopDistribution off by default when invoked
+  /// from the optimization pipeline but on when invoked explicitly from opt.
+  LoopDistribute(bool ProcessAllLoopsByDefault = true)
+      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
+    // The default is set by the caller.
+    if (EnableLoopDistribute.getNumOccurrences() > 0)
+      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

@ -812,7 +865,11 @@ public:
    for (Loop *L : Worklist) {
      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
      LoopDistributeForLoop LDL(L, LI, LAI, DT, SE);
-      Changed |= LDL.processLoop();
+
+      // If distribution was forced for the specific loop to be
+      // enabled/disabled, follow that.  Otherwise use the global flag.
+      if (LDL.isForced().getValueOr(ProcessAllLoops))
+        Changed |= LDL.processLoop();
    }

    // Process each loop nest in the function.
@ -829,6 +886,11 @@ public:
  }

  static char ID;
+
+private:
+  /// \brief Whether distribution should be on in this function.  The per-loop
+  /// pragma can override this.
+  bool ProcessAllLoops;
 };
 } // anonymous namespace

@ -843,5 +905,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

 namespace llvm {
-FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
+  return new LoopDistribute(ProcessAllLoopsByDefault);
+}
 }
--- a/llvm/test/Transforms/LoopDistribute/metadata.ll
+++ b/llvm/test/Transforms/LoopDistribute/metadata.ll
@ -0,0 +1,149 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=0 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_OFF
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_ON
+
+; Same loop as in basic.ll.  Check that distribution is enabled/disabled
+; properly according to -enable-loop-distribute=0/1 and the
+; llvm.loop.distribute.enable metadata.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @explicit_on(
+define void @explicit_on(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+; EXPLICIT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; CHECK-LABEL: @explicit_off(
+define void @explicit_off(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+; EXPLICIT-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; CHECK-LABEL: @default_distribute(
+define void @default_distribute(i32* noalias %a,
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; DEFAULT_ON: for.body.ldist1:
+; DEFAULT_OFF-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.distribute.enable", i1 false}