[LoopDist] Add llvm.loop.distribute.enable loop metadata

Summary:
D19403 adds a new pragma for loop distribution.  This change adds
support for the corresponding metadata that the pragma is translated to
by the FE.

As part of this I had to rethink the flag -enable-loop-distribute.  My
goal was to be backward compatible with the existing behavior:

  A1. pass is off by default from the optimization pipeline
  unless -enable-loop-distribute is specified

  A2. pass is on when invoked directly from opt (e.g. for unit-testing)

The new pragma/metadata overrides these defaults so the new behavior is:

  B1. A1 + enable distribution for individual loop with the pragma/metadata

  B2. A2 + disable distribution for individual loop with the pragma/metadata

The default value whether the pass is on or off comes from the initiator
of the pass.  From the PassManagerBuilder the default is off, from opt
it's on.

I moved -enable-loop-distribute under the pass.  If the flag is
specified it overrides the default from above.

Then the pragma/metadata can further modifies this per loop.

As a side-effect, we can now also use -enable-loop-distribute=0 from opt
to emulate the default from the optimization pipeline.  So to be precise
this is the new behavior:

  C1. pass is off by default from the optimization pipeline
  unless -enable-loop-distribute or the pragma/metadata enables it

  C2. pass is on when invoked directly from opt
  unless -enable-loop-distribute=0 or the pragma/metadata disables it

Reviewers: hfinkel

Subscribers: joker.eph, mzolotukhin, llvm-commits

Differential Revision: http://reviews.llvm.org/D19431

llvm-svn: 267672
This commit is contained in:
Adam Nemet 2016-04-27 05:28:18 +00:00
parent 08efb0efcd
commit d2fa414718
5 changed files with 247 additions and 13 deletions

View File

@ -4711,6 +4711,27 @@ which is the string ``llvm.loop.licm_versioning.disable``. For example:
!0 = !{!"llvm.loop.licm_versioning.disable"}
'``llvm.loop.distribute.enable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Loop distribution allows splitting a loop into multiple loops. Currently,
this is only performed if the entire loop cannot be vectorized due to unsafe
memory dependencies. The transformation will atempt to isolate the unsafe
dependencies into their own loop.
This metadata can be used to selectively enable or disable distribution of the
loop. The first operand is the string ``llvm.loop.distribute.enable`` and the
second operand is a bit. If the bit operand value is 1 distribution is
enabled. A value of 0 disables distribution:
.. code-block:: llvm
!0 = !{!"llvm.loop.distribute.enable", i1 0}
!1 = !{!"llvm.loop.distribute.enable", i1 1}
This metadata should be used in conjunction with ``llvm.loop`` loop
identification metadata.
'``llvm.mem``'
^^^^^^^^^^^^^^^

View File

@ -479,7 +479,10 @@ FunctionPass *createNaryReassociatePass();
//
// LoopDistribute - Distribute loops.
//
FunctionPass *createLoopDistributePass();
// ProcessAllLoopsByDefault instructs the pass to look for distribution
// opportunities in all loops unless -enable-loop-distribute or the
// llvm.loop.distribute.enable metadata data override this default.
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault);
//===----------------------------------------------------------------------===//
//

View File

@ -95,10 +95,6 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
static cl::opt<bool> EnableLoopDistribute(
"enable-loop-distribute", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopDistribution Pass"));
static cl::opt<bool> EnableNonLTOGlobalsModRef(
"enable-non-lto-gmr", cl::init(true), cl::Hidden,
cl::desc(
@ -480,9 +476,10 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
// Distribute loops to allow partial vectorization. I.e. isolate dependences
// into separate loop that would otherwise inhibit vectorization.
if (EnableLoopDistribute)
MPM.add(createLoopDistributePass());
// into separate loop that would otherwise inhibit vectorization. This is
// currently only performed for loops marked with the metadata
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false));
MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));

View File

@ -60,6 +60,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed for Loop "
"Distribution"));
static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
"loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
cl::Hidden,
cl::desc(
"The maximum number of SCEV checks allowed for Loop "
"Distribution for loop marked with #pragma loop distribute(enable)"));
// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
"enable-loop-distribute", cl::Hidden,
cl::desc("Enable the new, experimental LoopDistribution Pass"));
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
@ -576,7 +589,9 @@ class LoopDistributeForLoop {
public:
LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT, ScalarEvolution *SE)
: L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {}
: L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {
setForced();
}
/// \brief Try to distribute an inner-most loop.
bool processLoop() {
@ -683,7 +698,9 @@ public:
// Don't distribute the loop if we need too many SCEV run-time checks.
const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
if (Pred.getComplexity() > (IsForced.getValueOr(false)
? PragmaDistributeSCEVCheckThreshold
: DistributeSCEVCheckThreshold)) {
DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
return false;
}
@ -735,6 +752,13 @@ public:
return true;
}
/// \brief Return if distribution forced to be enabled/disabled for the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
/// to be enabled (true) or disabled (false). If the optional has no value
/// distribution was not forced either way.
const Optional<bool> &isForced() const { return IsForced; }
private:
/// \brief Filter out checks between pointers from the same partition.
///
@ -775,18 +799,47 @@ private:
return Checks;
}
/// \brief Check whether the loop metadata is forcing distribution to be
/// enabled/disabled.
void setForced() {
Optional<const MDOperand *> Value =
findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
if (!Value)
return;
const MDOperand *Op = *Value;
assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
}
// Analyses used.
Loop *L;
LoopInfo *LI;
const LoopAccessInfo &LAI;
DominatorTree *DT;
ScalarEvolution *SE;
/// \brief Indicates whether distribution is forced to be enabled/disabled for
/// the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
/// to be enabled (true) or disabled (false). If the optional has no value
/// distribution was not forced either way.
Optional<bool> IsForced;
};
/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
LoopDistribute() : FunctionPass(ID) {
/// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
/// performed by default. Pass -enable-loop-distribute={0,1} overrides this
/// default. We use this to keep LoopDistribution off by default when invoked
/// from the optimization pipeline but on when invoked explicitly from opt.
LoopDistribute(bool ProcessAllLoopsByDefault = true)
: FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
// The default is set by the caller.
if (EnableLoopDistribute.getNumOccurrences() > 0)
ProcessAllLoops = EnableLoopDistribute;
initializeLoopDistributePass(*PassRegistry::getPassRegistry());
}
@ -812,7 +865,11 @@ public:
for (Loop *L : Worklist) {
const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
LoopDistributeForLoop LDL(L, LI, LAI, DT, SE);
Changed |= LDL.processLoop();
// If distribution was forced for the specific loop to be
// enabled/disabled, follow that. Otherwise use the global flag.
if (LDL.isForced().getValueOr(ProcessAllLoops))
Changed |= LDL.processLoop();
}
// Process each loop nest in the function.
@ -829,6 +886,11 @@ public:
}
static char ID;
private:
/// \brief Whether distribution should be on in this function. The per-loop
/// pragma can override this.
bool ProcessAllLoops;
};
} // anonymous namespace
@ -843,5 +905,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
namespace llvm {
FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
return new LoopDistribute(ProcessAllLoopsByDefault);
}
}

View File

@ -0,0 +1,149 @@
; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=0 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_OFF
; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_ON
; Same loop as in basic.ll. Check that distribution is enabled/disabled
; properly according to -enable-loop-distribute=0/1 and the
; llvm.loop.distribute.enable metadata.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
; CHECK-LABEL: @explicit_on(
define void @explicit_on(i32* noalias %a,
i32* noalias %b,
i32* noalias %c,
i32* noalias %d,
i32* noalias %e) {
entry:
br label %for.body
; EXPLICIT: for.body.ldist1:
for.body: ; preds = %for.body, %entry
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
%loadA = load i32, i32* %arrayidxA, align 4
%arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
%loadB = load i32, i32* %arrayidxB, align 4
%mulA = mul i32 %loadB, %loadA
%add = add nuw nsw i64 %ind, 1
%arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
store i32 %mulA, i32* %arrayidxA_plus_4, align 4
%arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
%loadD = load i32, i32* %arrayidxD, align 4
%arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
%loadE = load i32, i32* %arrayidxE, align 4
%mulC = mul i32 %loadD, %loadE
%arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
store i32 %mulC, i32* %arrayidxC, align 4
%exitcond = icmp eq i64 %add, 20
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end: ; preds = %for.body
ret void
}
; CHECK-LABEL: @explicit_off(
define void @explicit_off(i32* noalias %a,
i32* noalias %b,
i32* noalias %c,
i32* noalias %d,
i32* noalias %e) {
entry:
br label %for.body
; EXPLICIT-NOT: for.body.ldist1:
for.body: ; preds = %for.body, %entry
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
%loadA = load i32, i32* %arrayidxA, align 4
%arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
%loadB = load i32, i32* %arrayidxB, align 4
%mulA = mul i32 %loadB, %loadA
%add = add nuw nsw i64 %ind, 1
%arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
store i32 %mulA, i32* %arrayidxA_plus_4, align 4
%arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
%loadD = load i32, i32* %arrayidxD, align 4
%arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
%loadE = load i32, i32* %arrayidxE, align 4
%mulC = mul i32 %loadD, %loadE
%arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
store i32 %mulC, i32* %arrayidxC, align 4
%exitcond = icmp eq i64 %add, 20
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
for.end: ; preds = %for.body
ret void
}
; CHECK-LABEL: @default_distribute(
define void @default_distribute(i32* noalias %a,
i32* noalias %b,
i32* noalias %c,
i32* noalias %d,
i32* noalias %e) {
entry:
br label %for.body
; Verify the two distributed loops.
; DEFAULT_ON: for.body.ldist1:
; DEFAULT_OFF-NOT: for.body.ldist1:
for.body: ; preds = %for.body, %entry
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
%loadA = load i32, i32* %arrayidxA, align 4
%arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
%loadB = load i32, i32* %arrayidxB, align 4
%mulA = mul i32 %loadB, %loadA
%add = add nuw nsw i64 %ind, 1
%arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
store i32 %mulA, i32* %arrayidxA_plus_4, align 4
%arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
%loadD = load i32, i32* %arrayidxD, align 4
%arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
%loadE = load i32, i32* %arrayidxE, align 4
%mulC = mul i32 %loadD, %loadE
%arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
store i32 %mulC, i32* %arrayidxC, align 4
%exitcond = icmp eq i64 %add, 20
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.distribute.enable", i1 false}