forked from OSchip/llvm-project
Revert "[Unroll] Implement a conservative and monotonically increasing cost tracking system during the full unroll heuristic analysis that avoids counting any instruction cost until that instruction becomes "live" through a side-effect or use outside the..."
This reverts commit r269388. It caused some bots to fail, I'm reverting it until I investigate the issue. llvm-svn: 269395
This commit is contained in:
parent
e58d6228dc
commit
9be3b8b9bb
|
@ -89,7 +89,6 @@ private:
|
|||
bool visitLoad(LoadInst &I);
|
||||
bool visitCastInst(CastInst &I);
|
||||
bool visitCmpInst(CmpInst &I);
|
||||
bool visitPHINode(PHINode &PN);
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -189,13 +189,3 @@ bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) {
|
|||
|
||||
return Base::visitCmpInst(I);
|
||||
}
|
||||
|
||||
bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) {
|
||||
// Run base visitor first. This way we can gather some useful for later
|
||||
// analysis information.
|
||||
if (Base::visitPHINode(PN))
|
||||
return true;
|
||||
|
||||
// The loop induction PHI nodes are definitionally free.
|
||||
return PN.getParent() == L->getHeader();
|
||||
}
|
||||
|
|
|
@ -184,40 +184,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
|
|||
return UP;
|
||||
}
|
||||
|
||||
namespace {
|
||||
/// A struct to densely store the state of an instruction after unrolling at
|
||||
/// each iteration.
|
||||
///
|
||||
/// This is designed to work like a tuple of <Instruction *, int> for the
|
||||
/// purposes of hashing and lookup, but to be able to associate two boolean
|
||||
/// states with each key.
|
||||
struct UnrolledInstState {
|
||||
Instruction *I;
|
||||
int Iteration : 30;
|
||||
unsigned IsFree : 1;
|
||||
unsigned IsCounted : 1;
|
||||
};
|
||||
|
||||
/// Hashing and equality testing for a set of the instruction states.
|
||||
struct UnrolledInstStateKeyInfo {
|
||||
typedef DenseMapInfo<Instruction *> PtrInfo;
|
||||
typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
|
||||
static inline UnrolledInstState getEmptyKey() {
|
||||
return {PtrInfo::getEmptyKey(), 0, 0, 0};
|
||||
}
|
||||
static inline UnrolledInstState getTombstoneKey() {
|
||||
return {PtrInfo::getTombstoneKey(), 0, 0, 0};
|
||||
}
|
||||
static inline unsigned getHashValue(const UnrolledInstState &S) {
|
||||
return PairInfo::getHashValue({S.I, S.Iteration});
|
||||
}
|
||||
static inline bool isEqual(const UnrolledInstState &LHS,
|
||||
const UnrolledInstState &RHS) {
|
||||
return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct EstimatedUnrollCost {
|
||||
/// \brief The estimated cost after unrolling.
|
||||
|
@ -252,25 +218,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
|
||||
"The unroll iterations max is too large!");
|
||||
|
||||
// Only analyze inner loops. We can't properly estimate cost of nested loops
|
||||
// and we won't visit inner loops again anyway.
|
||||
if (!L->empty())
|
||||
return None;
|
||||
|
||||
// Don't simulate loops with a big or unknown tripcount
|
||||
if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
|
||||
TripCount > UnrollMaxIterationsCountToAnalyze)
|
||||
return None;
|
||||
|
||||
SmallSetVector<BasicBlock *, 16> BBWorklist;
|
||||
SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
|
||||
DenseMap<Value *, Constant *> SimplifiedValues;
|
||||
SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
|
||||
|
||||
// The estimated cost of the unrolled form of the loop. We try to estimate
|
||||
// this by simplifying as much as we can while computing the estimate.
|
||||
int UnrolledCost = 0;
|
||||
|
||||
// We also track the estimated dynamic (that is, actually executed) cost in
|
||||
// the rolled form. This helps identify cases when the savings from unrolling
|
||||
// aren't just exposing dead control flows, but actual reduced dynamic
|
||||
|
@ -278,97 +237,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
// unrolling.
|
||||
int RolledDynamicCost = 0;
|
||||
|
||||
// We track the simplification of each instruction in each iteration. We use
|
||||
// this to recursively merge costs into the unrolled cost on-demand so that
|
||||
// we don't count the cost of any dead code. This is essentially a map from
|
||||
// <instruction, int> to <bool, bool>, but stored as a densely packed struct.
|
||||
DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
|
||||
|
||||
// A small worklist used to accumulate cost of instructions from each
|
||||
// observable and reached root in the loop.
|
||||
SmallVector<Instruction *, 16> CostWorklist;
|
||||
|
||||
// PHI-used worklist used between iterations while accumulating cost.
|
||||
SmallVector<Instruction *, 4> PHIUsedList;
|
||||
|
||||
// Helper function to accumulate cost for instructions in the loop.
|
||||
auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
|
||||
assert(Iteration >= 0 && "Cannot have a negative iteration!");
|
||||
assert(CostWorklist.empty() && "Must start with an empty cost list");
|
||||
assert(PHIUsedList.empty() && "Must start with an empty phi used list");
|
||||
CostWorklist.push_back(&RootI);
|
||||
for (;; --Iteration) {
|
||||
do {
|
||||
Instruction *I = CostWorklist.pop_back_val();
|
||||
|
||||
// InstCostMap only uses I and Iteration as a key, the other two values
|
||||
// don't matter here.
|
||||
auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
|
||||
if (CostIter == InstCostMap.end())
|
||||
// If an input to a PHI node comes from a dead path through the loop
|
||||
// we may have no cost data for it here. What that actually means is
|
||||
// that it is free.
|
||||
continue;
|
||||
auto &Cost = *CostIter;
|
||||
if (Cost.IsCounted)
|
||||
// Already counted this instruction.
|
||||
continue;
|
||||
|
||||
// Mark that we are counting the cost of this instruction now.
|
||||
Cost.IsCounted = true;
|
||||
|
||||
// If this is a PHI node in the loop header, just add it to the PHI set.
|
||||
if (auto *PhiI = dyn_cast<PHINode>(I))
|
||||
if (PhiI->getParent() == L->getHeader()) {
|
||||
assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
|
||||
"inherently simplify during unrolling.");
|
||||
if (Iteration == 0)
|
||||
continue;
|
||||
|
||||
// Push the incoming value from the backedge into the PHI used list
|
||||
// if it is an in-loop instruction. We'll use this to populate the
|
||||
// cost worklist for the next iteration (as we count backwards).
|
||||
if (auto *OpI = dyn_cast<Instruction>(
|
||||
PhiI->getIncomingValueForBlock(L->getLoopLatch())))
|
||||
if (L->contains(OpI))
|
||||
PHIUsedList.push_back(OpI);
|
||||
continue;
|
||||
}
|
||||
|
||||
// First accumulate the cost of this instruction.
|
||||
if (!Cost.IsFree) {
|
||||
UnrolledCost += TTI.getUserCost(I);
|
||||
DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
|
||||
<< "): ");
|
||||
DEBUG(I->dump());
|
||||
}
|
||||
|
||||
// We must count the cost of every operand which is not free,
|
||||
// recursively. If we reach a loop PHI node, simply add it to the set
|
||||
// to be considered on the next iteration (backwards!).
|
||||
for (Value *Op : I->operands()) {
|
||||
// Check whether this operand is free due to being a constant or
|
||||
// outside the loop.
|
||||
auto *OpI = dyn_cast<Instruction>(Op);
|
||||
if (!OpI || !L->contains(OpI))
|
||||
continue;
|
||||
|
||||
// Otherwise accumulate its cost.
|
||||
CostWorklist.push_back(OpI);
|
||||
}
|
||||
} while (!CostWorklist.empty());
|
||||
|
||||
if (PHIUsedList.empty())
|
||||
// We've exhausted the search.
|
||||
break;
|
||||
|
||||
assert(Iteration > 0 &&
|
||||
"Cannot track PHI-used values past the first iteration!");
|
||||
CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
|
||||
PHIUsedList.clear();
|
||||
}
|
||||
};
|
||||
|
||||
// Ensure that we don't violate the loop structure invariants relied on by
|
||||
// this analysis.
|
||||
assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
|
||||
|
@ -423,32 +291,22 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
// it. We don't change the actual IR, just count optimization
|
||||
// opportunities.
|
||||
for (Instruction &I : *BB) {
|
||||
// Track this instruction's expected baseline cost when executing the
|
||||
// rolled loop form.
|
||||
RolledDynamicCost += TTI.getUserCost(&I);
|
||||
int InstCost = TTI.getUserCost(&I);
|
||||
|
||||
// Visit the instruction to analyze its loop cost after unrolling,
|
||||
// and if the visitor returns true, mark the instruction as free after
|
||||
// unrolling and continue.
|
||||
bool IsFree = Analyzer.visit(I);
|
||||
bool Inserted = InstCostMap.insert({&I, (int)Iteration, IsFree,
|
||||
/*IsCounted*/ false})
|
||||
.second;
|
||||
(void)Inserted;
|
||||
assert(Inserted && "Cannot have a state for an unvisited instruction!");
|
||||
// and if the visitor returns false, include this instruction in the
|
||||
// unrolled cost.
|
||||
if (!Analyzer.visit(I))
|
||||
UnrolledCost += InstCost;
|
||||
else {
|
||||
DEBUG(dbgs() << " " << I
|
||||
<< " would be simplified if loop is unrolled.\n");
|
||||
(void)0;
|
||||
}
|
||||
|
||||
if (IsFree)
|
||||
continue;
|
||||
|
||||
// If the instruction might have a side-effect recursively account for
|
||||
// the cost of it and all the instructions leading up to it.
|
||||
if (I.mayHaveSideEffects())
|
||||
AddCostRecursively(I, Iteration);
|
||||
|
||||
// Can't properly model a cost of a call.
|
||||
// FIXME: With a proper cost model we should be able to do it.
|
||||
if(isa<CallInst>(&I))
|
||||
return None;
|
||||
// Also track this instructions expected cost when executing the rolled
|
||||
// loop form.
|
||||
RolledDynamicCost += InstCost;
|
||||
|
||||
// If unrolled body turns out to be too big, bail out.
|
||||
if (UnrolledCost > MaxUnrolledLoopSize) {
|
||||
|
@ -477,8 +335,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
|
||||
if (L->contains(Succ))
|
||||
BBWorklist.insert(Succ);
|
||||
else
|
||||
ExitWorklist.insert({BB, Succ});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -494,8 +350,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
.getCaseSuccessor();
|
||||
if (L->contains(Succ))
|
||||
BBWorklist.insert(Succ);
|
||||
else
|
||||
ExitWorklist.insert({BB, Succ});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -504,8 +358,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
for (BasicBlock *Succ : successors(BB))
|
||||
if (L->contains(Succ))
|
||||
BBWorklist.insert(Succ);
|
||||
else
|
||||
ExitWorklist.insert({BB, Succ});
|
||||
}
|
||||
|
||||
// If we found no optimization opportunities on the first iteration, we
|
||||
|
@ -516,23 +368,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
|
|||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
while (!ExitWorklist.empty()) {
|
||||
BasicBlock *ExitingBB, *ExitBB;
|
||||
std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
|
||||
|
||||
for (Instruction &I : *ExitBB) {
|
||||
auto *PN = dyn_cast<PHINode>(&I);
|
||||
if (!PN)
|
||||
break;
|
||||
|
||||
Value *Op = PN->getIncomingValueForBlock(ExitingBB);
|
||||
if (auto *OpI = dyn_cast<Instruction>(Op))
|
||||
if (L->contains(OpI))
|
||||
AddCostRecursively(*OpI, TripCount - 1);
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Analysis finished:\n"
|
||||
<< "UnrolledCost: " << UnrolledCost << ", "
|
||||
<< "RolledDynamicCost: " << RolledDynamicCost << "\n");
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16
|
||||
|
||||
; If a load becomes a constant after loop unrolling, we sometimes can simplify
|
||||
; CFG. This test verifies that we handle such cases.
|
||||
; After one operand in an instruction is constant-folded and the
|
||||
; instruction is simplified, the other operand might become dead.
|
||||
; In this test we have::
|
||||
; for i in 1..10:
|
||||
; r += A[i] * B[i]
|
||||
; A[i] is 0 almost at every iteration, so there is no need in loading B[i] at
|
||||
; all.
|
||||
|
||||
|
||||
; CHECK-LABEL: @unroll_dce
|
||||
; CHECK-NOT: br i1 %exitcond, label %for.end, label %for.body
|
||||
define i32 @unroll_dce(i32* noalias nocapture readonly %b) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ]
|
||||
%r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ]
|
||||
%arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0
|
||||
%x1 = load i32, i32* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0
|
||||
%x2 = load i32, i32* %arrayidx2, align 4
|
||||
%mul = mul i32 %x1, %x2
|
||||
%r.1 = add i32 %mul, %r.0
|
||||
%iv.1 = add nuw nsw i64 %iv.0, 1
|
||||
%exitcond = icmp eq i64 %iv.1, 10
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %for.body
|
||||
ret i32 %r.1
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; When examining gep-instructions we shouldn't consider them simplified if the
|
||||
|
|
|
@ -134,7 +134,6 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
|
|||
" br label %outer.loop\n"
|
||||
"outer.loop:\n"
|
||||
" %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %outer.loop.latch ]\n"
|
||||
" %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
|
||||
" br label %inner.loop\n"
|
||||
"inner.loop:\n"
|
||||
" %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]\n"
|
||||
|
@ -142,6 +141,7 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
|
|||
" %exitcond.inner = icmp eq i64 %iv.inner.next, 1000\n"
|
||||
" br i1 %exitcond.inner, label %outer.loop.latch, label %inner.loop\n"
|
||||
"outer.loop.latch:\n"
|
||||
" %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n"
|
||||
" %exitcond.outer = icmp eq i64 %iv.outer.next, 40\n"
|
||||
" br i1 %exitcond.outer, label %exit, label %outer.loop\n"
|
||||
"exit:\n"
|
||||
|
@ -163,15 +163,11 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) {
|
|||
BasicBlock *InnerBody = &*FI++;
|
||||
|
||||
BasicBlock::iterator BBI = Header->begin();
|
||||
BBI++;
|
||||
Instruction *Y1 = &*BBI;
|
||||
Instruction *Y1 = &*BBI++;
|
||||
BBI = InnerBody->begin();
|
||||
BBI++;
|
||||
Instruction *Y2 = &*BBI;
|
||||
Instruction *Y2 = &*BBI++;
|
||||
// Check that we can simplify IV of the outer loop, but can't simplify the IV
|
||||
// of the inner loop if we only know the iteration number of the outer loop.
|
||||
//
|
||||
// Y1 is %iv.outer.next, Y2 is %iv.inner.next
|
||||
auto I1 = SimplifiedValuesVector[0].find(Y1);
|
||||
EXPECT_TRUE(I1 != SimplifiedValuesVector[0].end());
|
||||
auto I2 = SimplifiedValuesVector[0].find(Y2);
|
||||
|
|
Loading…
Reference in New Issue