LoopVectorizer: Add a basic cost model which uses the VTTI interface.

llvm-svn: 166620
This commit is contained in:
Nadav Rotem 2012-10-24 20:36:32 +00:00
parent be20d43232
commit a721b21c64
11 changed files with 322 additions and 39 deletions

View File

@ -18,10 +18,13 @@
//
// This pass has three parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A helper class that checks for the legality
// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
// widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@ -51,13 +54,14 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@ -67,13 +71,14 @@
using namespace llvm;
static cl::opt<unsigned>
DefaultVectorizationFactor("default-loop-vectorize-width",
cl::init(4), cl::Hidden,
cl::desc("Set the default loop vectorization width"));
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Set the default vectorization width. Zero is autoselect."));
namespace {
// Forward declaration.
// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
@ -229,11 +234,10 @@ public:
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
/// Returns the maximum vectorization factor that we *can* use to vectorize
/// this loop. This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so. This may be a large number. We
/// can vectorize to any SIMD width below this number.
unsigned getLoopMaxVF();
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
bool canVectorize();
/// Returns the Induction variable.
PHINode *getInduction() {return Induction;}
@ -286,6 +290,49 @@ private:
SmallPtrSet<Value*, 4> AllowedExit;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
/// C'tor.
LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
LoopVectorizationLegality *Leg,
const VectorTargetTransformInfo *Vtti):
TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
/// Returns the most profitable vectorization factor for the loop that is
/// smaller or equal to the VF argument. This method checks every power
/// of two up to VF.
unsigned findBestVectorizationFactor(unsigned VF = 4);
private:
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
unsigned expectedCost(unsigned VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
unsigned getInstructionCost(Instruction *I, unsigned VF);
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
ScalarEvolution *SE;
/// DataLayout analysis.
DataLayout *DL;
/// Vectorization legality.
LoopVectorizationLegality *Legal;
/// Vector target information.
const VectorTargetTransformInfo *VTTI;
};
struct LoopVectorize : public LoopPass {
static char ID; // Pass identification, replacement for typeid
@ -296,6 +343,7 @@ struct LoopVectorize : public LoopPass {
ScalarEvolution *SE;
DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@ -305,25 +353,42 @@ struct LoopVectorize : public LoopPass {
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
TTI = getAnalysisIfAvailable<TargetTransformInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
LoopVectorizationLegality LVL(L, SE, DL);
unsigned MaxVF = LVL.getLoopMaxVF();
// Check that we can vectorize this loop using the chosen vectorization
// width.
if (MaxVF < DefaultVectorizationFactor) {
DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
// Select the preffered vectorization factor.
unsigned VF = 1;
if (VectorizationFactor == 0) {
const VectorTargetTransformInfo *VTTI = 0;
if (TTI)
VTTI = TTI->getVectorTargetTransformInfo();
// Use the cost model.
LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
VF = CM.findBestVectorizationFactor();
if (VF == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
return false;
}
} else {
// Use the user command flag.
VF = VectorizationFactor;
}
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
// If we decided that it is *legal* to vectorizer the loop then do it.
SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@ -656,6 +721,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
// into the code below should be also be implemented in
// the cost-model.
//
//===------------------------------------------------===//
typedef SmallVector<PHINode*, 4> PhiVector;
BasicBlock &BB = *OrigLoop->getHeader();
Constant *Zero = ConstantInt::get(
@ -957,18 +1029,18 @@ void SingleBlockLoopVectorizer::cleanup() {
SE->forgetLoop(OrigLoop);
}
unsigned LoopVectorizationLegality::getLoopMaxVF() {
bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
assert(false && "No preheader!!");
DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
return 1;
return false;
}
// We can only vectorize single basic block loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1) {
DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
return 1;
return false;
}
// We need to have a loop header.
@ -978,22 +1050,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
// Go over each instruction and look at memory deps.
if (!canVectorizeBlock(*BB)) {
DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
return 1;
return false;
}
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return 1;
return false;
}
DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return the
// maximum SIMD size.
return DefaultVectorizationFactor;
// which may limit our maximum vectorization factor, so just return true with
// no restrictions.
return true;
}
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@ -1323,6 +1395,177 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
return true;
}
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
if (!VTTI) {
DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
return 1;
}
float Cost = expectedCost(1);
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
for (unsigned i=2; i <= VF; i*=2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
float VectorCost = expectedCost(i) / (float)i;
DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
}
}
DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
return Width;
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
// We can only estimate the cost of single basic block loops.
assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
BasicBlock *BB = TheLoop->getHeader();
unsigned Cost = 0;
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Instruction *Inst = it;
Cost += getInstructionCost(Inst, VF);
}
// Return the cost divided by VF, because we will be executing
// less iterations of the vector form.
return Cost;
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
assert(VTTI && "Invalid vector target transformation info");
switch (I->getOpcode()) {
case Instruction::Br: {
return VTTI->getInstrCost(I->getOpcode());
}
case Instruction::PHI:
// PHIs are handled the same as the binary instructions below.
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
Type *VTy = VectorType::get(I->getType(), VF);
return VTTI->getInstrCost(I->getOpcode(), VTy);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
Type *VTy = VectorType::get(I->getType(), VF);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
return VTTI->getInstrCost(I->getOpcode(), VTy);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(I);
Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
// Scalarized stores.
if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
unsigned Cost = 0;
unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
// The cost of extracting from the vector value.
Cost += VF * ExtCost;
// The cost of the scalar stores.
Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
return Cost;
}
// Wide stores.
return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
SI->getPointerAddressSpace());
}
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(I);
Type *VTy = VectorType::get(I->getType(), VF);
// Scalarized loads.
if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
unsigned Cost = 0;
unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
// The cost of inserting the loaded value into the result vector.
Cost += VF * InCost;
// The cost of the scalar stores.
Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
return Cost;
}
// Wide loads.
return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
LI->getPointerAddressSpace());
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
Type *DstTy = VectorType::get(I->getType(), VF);
return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar
// instruction, plus the cost of insert and extract into vector
// elements, times the vector width.
unsigned Cost = 0;
Type *Ty = I->getType();
if (!Ty->isVoidTy()) {
Type *VTy = VectorType::get(Ty, VF);
unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
Cost += VF * (InsCost + ExtCost);
}
/// We don't have any information on the scalar instruction, but maybe
/// the target has.
/// TODO: This may be a target-specific intrinsic.
/// Need to add API for that.
Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
return Cost;
}
}// end of switch.
}
} // namespace
char LoopVectorize::ID = 0;

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce
; Check that we don't fall into an infinite loop.
define void @test() nounwind {

View File

@ -0,0 +1,40 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@c = common global [2048 x i32] zeroinitializer, align 16
@b = common global [2048 x i32] zeroinitializer, align 16
@d = common global [2048 x i32] zeroinitializer, align 16
@a = common global [2048 x i32] zeroinitializer, align 16
; At this point the cost model is pretty bad and we are vectorizing the code below.
; TODO: This code should not be vectorized on x86.
;CHECK: cost_model_1
;CHECK: <4 x i32>
;CHECK: ret void
define void @cost_model_1() nounwind uwtable noinline ssp {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%0 = shl nsw i64 %indvars.iv, 1
%arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0
%1 = load i32* %arrayidx, align 8
%idxprom1 = sext i32 %1 to i64
%arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1
%2 = load i32* %arrayidx2, align 4
%arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
%3 = load i32* %arrayidx4, align 4
%idxprom5 = sext i32 %3 to i64
%arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5
store i32 %2, i32* %arrayidx6, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 256
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

View File

@ -1,4 +1,4 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"