[LoopVectorizer] Refine loop vectorizer's register usage calculator by ignoring specific instructions.

(This is the second attempt to check in this patch: REQUIRES: asserts is added
to reg-usage.ll now.)

LoopVectorizationCostModel::calculateRegisterUsage() is used to estimate the
register usage for specific VFs. However, it takes into account many
instructions that won't be vectorized, such as induction variables,
GetElementPtr instruction, etc.. This makes the loop vectorizer too conservative
when choosing VF. In this patch, the induction variables that won't be
vectorized plus GetElementPtr instruction will be added to ValuesToIgnore set
so that their register usage won't be considered any more.


Differential revision: http://reviews.llvm.org/D15177

llvm-svn: 255460
This commit is contained in:
Cong Hou 2015-12-13 16:55:46 +00:00
parent 3e0c022aed
commit e6a210f50b
3 changed files with 176 additions and 32 deletions

View File

@ -1409,15 +1409,14 @@ private:
/// different operations. /// different operations.
class LoopVectorizationCostModel { class LoopVectorizationCostModel {
public: public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
LoopVectorizationLegality *Legal, LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI, const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB, const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC, const Function *F, AssumptionCache *AC, const Function *F,
const LoopVectorizeHints *Hints, const LoopVectorizeHints *Hints)
SmallPtrSetImpl<const Value *> &ValuesToIgnore) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), TheFunction(F), Hints(Hints) {}
TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
/// Information about vectorization costs /// Information about vectorization costs
struct VectorizationFactor { struct VectorizationFactor {
@ -1465,6 +1464,9 @@ public:
SmallVector<RegisterUsage, 8> SmallVector<RegisterUsage, 8>
calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
private: private:
/// Returns the expected execution cost. The unit of the cost does /// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different /// not matter because we use the 'cost' units to compare different
@ -1496,8 +1498,8 @@ public:
/// The loop that we evaluate. /// The loop that we evaluate.
Loop *TheLoop; Loop *TheLoop;
/// Scev analysis. /// Predicated scalar evolution analysis.
ScalarEvolution *SE; PredicatedScalarEvolution &PSE;
/// Loop Info analysis. /// Loop Info analysis.
LoopInfo *LI; LoopInfo *LI;
/// Vectorization legality. /// Vectorization legality.
@ -1506,13 +1508,17 @@ public:
const TargetTransformInfo &TTI; const TargetTransformInfo &TTI;
/// Target Library Info. /// Target Library Info.
const TargetLibraryInfo *TLI; const TargetLibraryInfo *TLI;
/// Demanded bits analysis /// Demanded bits analysis.
DemandedBits *DB; DemandedBits *DB;
/// Assumption cache.
AssumptionCache *AC;
const Function *TheFunction; const Function *TheFunction;
// Loop Vectorize Hint. /// Loop Vectorize Hint.
const LoopVectorizeHints *Hints; const LoopVectorizeHints *Hints;
// Values to ignore in the cost model. /// Values to ignore in the cost model.
const SmallPtrSetImpl<const Value *> &ValuesToIgnore; SmallPtrSet<const Value *, 16> ValuesToIgnore;
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
}; };
/// \brief This holds vectorization requirements that must be verified late in /// \brief This holds vectorization requirements that must be verified late in
@ -1757,19 +1763,10 @@ struct LoopVectorize : public FunctionPass {
return false; return false;
} }
// Collect values we want to ignore in the cost model. This includes
// type-promoting instructions we identified during reduction detection.
SmallPtrSet<const Value *, 32> ValuesToIgnore;
CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
for (auto &Reduction : *LVL.getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
ValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Use the cost model. // Use the cost model.
LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC, LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
F, &Hints, ValuesToIgnore); &Hints);
CM.collectValuesToIgnore();
// Check the function attributes to find out if this function should be // Check the function attributes to find out if this function should be
// optimized for size. // optimized for size.
@ -4737,7 +4734,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
} }
// Find the trip count. // Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop); unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@ -4939,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1; return 1;
// Do not interleave loops with a relatively small trip count. // Do not interleave loops with a relatively small trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop); unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold) if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1; return 1;
@ -5167,15 +5164,15 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// Ignore instructions that are never used within the loop. // Ignore instructions that are never used within the loop.
if (!Ends.count(I)) continue; if (!Ends.count(I)) continue;
// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;
// Remove all of the instructions that end at this location. // Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i]; InstrList &List = TransposeEnds[i];
for (unsigned int j = 0, e = List.size(); j < e; ++j) for (unsigned int j = 0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]); OpenIntervals.erase(List[j]);
// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;
// For each VF find the maximum usage of registers. // For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) { for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
if (VFs[j] == 1) { if (VFs[j] == 1) {
@ -5185,8 +5182,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// Count the number of live intervals. // Count the number of live intervals.
unsigned RegUsage = 0; unsigned RegUsage = 0;
for (auto Inst : OpenIntervals) for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
if (VecValuesToIgnore.count(Inst))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]); RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
}
MaxUsages[j] = std::max(MaxUsages[j], RegUsage); MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
} }
@ -5330,6 +5331,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (VF > 1 && MinBWs.count(I)) if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Type *VectorTy = ToVectorTy(RetTy, VF); Type *VectorTy = ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
// TODO: We need to estimate the cost of intrinsic calls. // TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) { switch (I->getOpcode()) {
@ -5631,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
return false; return false;
} }
void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : *Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore induction phis that are only used in either GetElementPtr or ICmp
// instruction to exit loop. Induction variables usually have large types and
// can have big impact when estimating register usage.
// This is for when VF > 1.
for (auto &Induction : *Legal->getInductionVars()) {
auto *PN = Induction.first;
auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
// Check that the PHI is only used by the induction increment (UpdateV) or
// by GEPs. Then check that UpdateV is only used by a compare instruction or
// the loop header PHI.
// FIXME: Need precise def-use analysis to determine if this instruction
// variable will be vectorized.
if (std::all_of(PN->user_begin(), PN->user_end(),
[&](const User *U) -> bool {
return U == UpdateV || isa<GetElementPtrInst>(U);
}) &&
std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
[&](const User *U) -> bool {
return U == PN || isa<ICmpInst>(U);
})) {
VecValuesToIgnore.insert(PN);
VecValuesToIgnore.insert(UpdateV);
}
}
// Ignore instructions that will not be vectorized.
// This is for when VF > 1.
for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
++bb) {
for (auto &Inst : **bb) {
switch (Inst.getOpcode()) {
case Instruction::GetElementPtr: {
// Ignore GEP if its last operand is an induction variable so that it is
// a consecutive load/store and won't be vectorized as scatter/gather
// pattern.
GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
unsigned NumOperands = Gep->getNumOperands();
unsigned InductionOperand = getGEPInductionOperand(Gep);
bool GepToIgnore = true;
// Check that all of the gep indices are uniform except for the
// induction operand.
for (unsigned i = 0; i != NumOperands; ++i) {
if (i != InductionOperand &&
!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
TheLoop)) {
GepToIgnore = false;
break;
}
}
if (GepToIgnore)
VecValuesToIgnore.insert(&Inst);
break;
}
}
}
}
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) { bool IfPredicateStore) {

View File

@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
; -vectorizer-maximize-bandwidth is indicated. ; -vectorizer-maximize-bandwidth is indicated.
; ;
; CHECK-label: foo ; CHECK-label: foo
; CHECK: LV: Selecting VF: 16. ; CHECK: LV: Selecting VF: 32.
define void @foo() { define void @foo() {
entry: entry:
br label %for.body br label %for.body

View File

@ -0,0 +1,69 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gpu -mcpu=core2 -mattr=+sse2 -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth 2>&1 | FileCheck %s
; REQUIRES: asserts
@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16
define i32 @foo() {
; This function has a loop of SAD pattern. Here we check when VF = 16 the
; register usage doesn't exceed 16.
;
; CHECK-LABEL: foo
; CHECK: LV(REG): VF = 4
; CHECK-NEXT: LV(REG): Found max usage: 4
; CHECK: LV(REG): VF = 8
; CHECK-NEXT: LV(REG): Found max usage: 7
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 13
entry:
br label %for.body
for.cond.cleanup:
%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa
for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
%arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
%1 = load i8, i8* %arrayidx2, align 1
%conv3 = zext i8 %1 to i32
%sub = sub nsw i32 %conv, %conv3
%ispos = icmp sgt i32 %sub, -1
%neg = sub nsw i32 0, %sub
%2 = select i1 %ispos, i32 %sub, i32 %neg
%add = add nsw i32 %2, %s.015
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define i64 @bar(i64* nocapture %a) {
; CHECK-LABEL: bar
; CHECK: LV(REG): VF = 2
; CHECK: LV(REG): Found max usage: 4
;
entry:
br label %for.body
for.cond.cleanup:
%add2.lcssa = phi i64 [ %add2, %for.body ]
ret i64 %add2.lcssa
for.body:
%i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
%0 = load i64, i64* %arrayidx, align 8
%add = add nsw i64 %0, %i.012
store i64 %add, i64* %arrayidx, align 8
%add2 = add nsw i64 %add, %s.011
%inc = add nuw nsw i64 %i.012, 1
%exitcond = icmp eq i64 %inc, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}