Begin incorporating target information into BBVectorize.

This is the first of several steps to incorporate information from the new
TargetTransformInfo infrastructure into BBVectorize. Two things are done here:

 1. Target information is used to determine whether it is profitable to fuse
    two instructions: the cost of the vector operation must not exceed the
    combined cost of the two original operations. Pairs that are not profitable
    are no longer considered (equal-cost pairs are still considered, because
    the current cost information is incomplete, for intrinsics for example);
    a sketch of this check appears after the list.

 2. The 'cost savings' computed for the profitability check are also used to
    rank the DAGs that represent the potential vectorization plans. Specifically,
    for nodes of non-trivial depth, the cost savings are used as the node
    weight.
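
For the non-memory case, the new profitability check amounts to the comparison
sketched below. This is only an illustration of the pattern, assuming the
VectorTargetTransformInfo::getInstrCost interface used in the patch (loads and
stores go through getMemoryOpCost with the pair's alignment and address space
instead); the helper name is made up, and the real check lives in
BBVectorize::areInstsCompatible in the diff below.

    // Sketch only (hypothetical helper): fusing I and J is considered
    // profitable when the vector instruction costs no more than the two
    // scalar instructions it replaces.
    static bool isFusionProfitable(const VectorTargetTransformInfo *VTTI,
                                   unsigned Opcode,
                                   Type *IT1, Type *IT2,  // scalar types of I
                                   Type *JT1, Type *JT2,  // scalar types of J
                                   Type *VT1, Type *VT2,  // fused vector types
                                   int &CostSavings) {
      unsigned ICost = VTTI->getInstrCost(Opcode, IT1, IT2);
      unsigned JCost = VTTI->getInstrCost(Opcode, JT1, JT2);
      unsigned VCost = VTTI->getInstrCost(Opcode, VT1, VT2);
      if (VCost > ICost + JCost)
        return false;                        // strictly more expensive: reject
      CostSavings = ICost + JCost - VCost;   // zero (equal cost) is accepted
      return true;
    }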

The next step will be to incorporate the shuffle costs into the DAG weighting;
this will give weights to the edges of the DAG as well. Once that is done, we
should be able to dispense with the depth heuristic whenever target information
is available.
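
To make the ranking in (2) concrete, the weighting is computed roughly as
follows (condensed from the findBestTreeFor change in the diff below); when no
target information is available, it falls back to the existing depth factor:

    // Sketch: rank a pruned tree by the summed cost savings of its
    // non-trivial-depth pairs when a cost model is present, or by the old
    // depth factor otherwise.
    int EffSize = 0;
    for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
         E = PrunedTree.end(); S != E; ++S) {
      if (VTTI) {
        if (getDepthFactor(S->first))
          EffSize += CandidatePairCostSavings.find(*S)->second;
      } else {
        EffSize += (int) getDepthFactor(S->first);
      }
    }
    // The tree is only selected when EffSize > 0, i.e. fusion is a net win.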

llvm-svn: 166716
commit cbf9365f4c (parent 579042f71b)
Hal Finkel, 2012-10-25 21:12:23 +00:00
3 changed files with 144 additions and 43 deletions


@@ -43,12 +43,17 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/DataLayout.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
using namespace llvm;
static cl::opt<bool>
IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
cl::Hidden, cl::desc("Ignore target information"));
static cl::opt<unsigned>
ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
cl::desc("The required chain depth for vectorization"));
@@ -181,9 +186,13 @@ namespace {
DT = &P->getAnalysis<DominatorTree>();
SE = &P->getAnalysis<ScalarEvolution>();
TD = P->getAnalysisIfAvailable<DataLayout>();
TTI = IgnoreTargetInfo ? 0 :
P->getAnalysisIfAvailable<TargetTransformInfo>();
VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
}
typedef std::pair<Value *, Value *> ValuePair;
typedef std::pair<ValuePair, int> ValuePairWithCost;
typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
typedef std::pair<std::multimap<Value *, Value *>::iterator,
@@ -196,6 +205,8 @@ namespace {
DominatorTree *DT;
ScalarEvolution *SE;
DataLayout *TD;
TargetTransformInfo *TTI;
const VectorTargetTransformInfo *VTTI;
// FIXME: const correct?
@@ -204,6 +215,7 @@ namespace {
bool getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len);
void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
@@ -216,6 +228,7 @@ namespace {
DenseSet<ValuePair> &PairableInstUsers);
void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
DenseSet<ValuePair> &PairableInstUsers,
@@ -228,7 +241,8 @@ namespace {
bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
bool areInstsCompatible(Instruction *I, Instruction *J,
bool IsSimpleLoadStore, bool NonPow2Len);
bool IsSimpleLoadStore, bool NonPow2Len,
int &CostSavings);
bool trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
@@ -270,13 +284,14 @@ namespace {
void findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
size_t &BestEffSize, VPIteratorPair ChoiceRange,
int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck);
Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
@@ -339,13 +354,16 @@ namespace {
return false;
}
DEBUG(if (VTTI) dbgs() << "BBV: using target information\n");
bool changed = false;
// Iterate a sufficient number of times to merge types of size 1 bit,
// then 2 bits, then 4, etc. up to half of the target vector width of the
// target vector register.
unsigned n = 1;
for (unsigned v = 2;
v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
(VTTI || v <= Config.VectorBits) &&
(!Config.MaxIter || n <= Config.MaxIter);
v *= 2, ++n) {
DEBUG(dbgs() << "BBV: fusing loop #" << n <<
" for " << BB.getName() << " in " <<
@@ -375,6 +393,9 @@ namespace {
DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
TD = getAnalysisIfAvailable<DataLayout>();
TTI = IgnoreTargetInfo ? 0 :
getAnalysisIfAvailable<TargetTransformInfo>();
VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
return vectorizeBB(BB);
}
@@ -427,6 +448,10 @@ namespace {
T2 = cast<CastInst>(I)->getSrcTy();
else
T2 = T1;
if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
T2 = SI->getCondition()->getType();
}
}
// Returns the weight associated with the provided value. A chain of
@@ -465,18 +490,25 @@ namespace {
// directly after J.
bool getPairPtrInfo(Instruction *I, Instruction *J,
Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
unsigned &IAddressSpace, unsigned &JAddressSpace,
int64_t &OffsetInElmts) {
OffsetInElmts = 0;
if (isa<LoadInst>(I)) {
IPtr = cast<LoadInst>(I)->getPointerOperand();
JPtr = cast<LoadInst>(J)->getPointerOperand();
IAlignment = cast<LoadInst>(I)->getAlignment();
JAlignment = cast<LoadInst>(J)->getAlignment();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LoadInst *LJ = cast<LoadInst>(J);
IPtr = LI->getPointerOperand();
JPtr = LJ->getPointerOperand();
IAlignment = LI->getAlignment();
JAlignment = LJ->getAlignment();
IAddressSpace = LI->getPointerAddressSpace();
JAddressSpace = LJ->getPointerAddressSpace();
} else {
IPtr = cast<StoreInst>(I)->getPointerOperand();
JPtr = cast<StoreInst>(J)->getPointerOperand();
IAlignment = cast<StoreInst>(I)->getAlignment();
JAlignment = cast<StoreInst>(J)->getAlignment();
StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
IPtr = SI->getPointerOperand();
JPtr = SJ->getPointerOperand();
IAlignment = SI->getAlignment();
JAlignment = SJ->getAlignment();
IAddressSpace = SI->getPointerAddressSpace();
JAddressSpace = SJ->getPointerAddressSpace();
}
const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
@@ -562,7 +594,9 @@ namespace {
do {
std::vector<Value *> PairableInsts;
std::multimap<Value *, Value *> CandidatePairs;
DenseMap<ValuePair, int> CandidatePairCostSavings;
ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
CandidatePairCostSavings,
PairableInsts, NonPow2Len);
if (PairableInsts.empty()) continue;
@@ -590,7 +624,8 @@ namespace {
// variables.
DenseMap<Value *, Value *> ChosenPairs;
choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
choosePairs(CandidatePairs, CandidatePairCostSavings,
PairableInsts, ConnectedPairs,
PairableInstUsers, ChosenPairs);
if (ChosenPairs.empty()) continue;
@@ -679,15 +714,22 @@ namespace {
!(VectorType::isValidElementType(T2) || T2->isVectorTy()))
return false;
if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) {
if (T1->getScalarSizeInBits() == 1) {
if (!Config.VectorizeBools)
return false;
} else {
if (!Config.VectorizeInts
&& (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
return false;
}
if (T2->getScalarSizeInBits() == 1) {
if (!Config.VectorizeBools)
return false;
} else {
if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
return false;
}
if (!Config.VectorizeFloats
&& (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
return false;
@@ -703,8 +745,8 @@ namespace {
T2->getScalarType()->isPointerTy()))
return false;
if (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
T2->getPrimitiveSizeInBits() >= Config.VectorBits)
if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
T2->getPrimitiveSizeInBits() >= Config.VectorBits))
return false;
return true;
@@ -715,10 +757,13 @@ namespace {
// that I has already been determined to be vectorizable and that J is not
// in the use tree of I.
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
bool IsSimpleLoadStore, bool NonPow2Len) {
bool IsSimpleLoadStore, bool NonPow2Len,
int &CostSavings) {
DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
" <-> " << *J << "\n");
CostSavings = 0;
// Loads and stores can be merged if they have different alignments,
// but are otherwise the same.
if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
@@ -731,38 +776,62 @@ namespace {
unsigned MaxTypeBits = std::max(
IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
if (MaxTypeBits > Config.VectorBits)
if (!VTTI && MaxTypeBits > Config.VectorBits)
return false;
// FIXME: handle addsub-type operations!
if (IsSimpleLoadStore) {
Value *IPtr, *JPtr;
unsigned IAlignment, JAlignment;
unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts = 0;
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
IAddressSpace, JAddressSpace,
OffsetInElmts) && abs64(OffsetInElmts) == 1) {
if (Config.AlignedOnly) {
Type *aTypeI = isa<StoreInst>(I) ?
cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
Type *aTypeJ = isa<StoreInst>(J) ?
cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
Type *aTypeI = isa<StoreInst>(I) ?
cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
Type *aTypeJ = isa<StoreInst>(J) ?
cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
if (Config.AlignedOnly) {
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
}
if (VTTI) {
unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
IAlignment, IAddressSpace);
unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
JAlignment, JAddressSpace);
unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
BottomAlignment,
IAddressSpace);
if (VCost > ICost + JCost)
return false;
CostSavings = ICost + JCost - VCost;
}
} else {
return false;
}
} else if (VTTI) {
unsigned ICost = VTTI->getInstrCost(I->getOpcode(), IT1, IT2);
unsigned JCost = VTTI->getInstrCost(J->getOpcode(), JT1, JT2);
Type *VT1 = getVecTypeForPair(IT1, JT1),
*VT2 = getVecTypeForPair(IT2, JT2);
unsigned VCost = VTTI->getInstrCost(I->getOpcode(), VT1, VT2);
if (VCost > ICost + JCost)
return false;
CostSavings = ICost + JCost - VCost;
}
// The powi intrinsic is special because only the first argument is
@@ -845,6 +914,7 @@ namespace {
bool BBVectorize::getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len) {
BasicBlock::iterator E = BB.end();
if (Start == E) return false;
@@ -881,7 +951,9 @@ namespace {
// J does not use I, and comes before the first use of I, so it can be
// merged with I if the instructions are compatible.
if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue;
int CostSavings;
if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
CostSavings)) continue;
// J is a candidate for merging with I.
if (!PairableInsts.size() ||
@@ -890,6 +962,9 @@ namespace {
}
CandidatePairs.insert(ValuePair(I, J));
if (VTTI)
CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
CostSavings));
// The next call to this function must start after the last instruction
// selected during this invocation.
@@ -899,7 +974,8 @@ namespace {
}
DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
<< *I << " <-> " << *J << "\n");
<< *I << " <-> " << *J << " (cost savings: " <<
CostSavings << ")\n");
// If we have already found too many pairs, break here and this function
// will be called again starting after the last instruction selected
@@ -1353,13 +1429,14 @@ namespace {
// pairs, given the choice of root pairs as an iterator range.
void BBVectorize::findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
size_t &BestEffSize, VPIteratorPair ChoiceRange,
int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck) {
for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
J != ChoiceRange.second; ++J) {
@@ -1409,17 +1486,26 @@ namespace {
PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
PrunedTree, *J, UseCycleCheck);
size_t EffSize = 0;
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S)
EffSize += getDepthFactor(S->first);
int EffSize = 0;
if (VTTI) {
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S) {
if (getDepthFactor(S->first))
EffSize += CandidatePairCostSavings.find(*S)->second;
}
} else {
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S)
EffSize += (int) getDepthFactor(S->first);
}
DEBUG(if (DebugPairSelection)
dbgs() << "BBV: found pruned Tree for pair {"
<< *J->first << " <-> " << *J->second << "} of depth " <<
MaxDepth << " and size " << PrunedTree.size() <<
" (effective size: " << EffSize << ")\n");
if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) {
if (MaxDepth >= Config.ReqChainDepth &&
EffSize > 0 && EffSize > BestEffSize) {
BestMaxDepth = MaxDepth;
BestEffSize = EffSize;
BestTree = PrunedTree;
@@ -1431,6 +1517,7 @@ namespace {
// that will be fused into vector instructions.
void BBVectorize::choosePairs(
std::multimap<Value *, Value *> &CandidatePairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
DenseSet<ValuePair> &PairableInstUsers,
@@ -1447,9 +1534,11 @@ namespace {
VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
// The best pair to choose and its tree:
size_t BestMaxDepth = 0, BestEffSize = 0;
size_t BestMaxDepth = 0;
int BestEffSize = 0;
DenseSet<ValuePair> BestTree;
findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
findBestTreeFor(CandidatePairs, CandidatePairCostSavings,
PairableInsts, ConnectedPairs,
PairableInstUsers, PairableInstUserMap, ChosenPairs,
BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
UseCycleCheck);
@@ -1505,12 +1594,13 @@ namespace {
Instruction *I, Instruction *J, unsigned o,
bool FlipMemInputs) {
Value *IPtr, *JPtr;
unsigned IAlignment, JAlignment;
unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts;
// Note: the analysis might fail here, that is why FlipMemInputs has
// been precomputed (OffsetInElmts must be unused here).
(void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
IAddressSpace, JAddressSpace,
OffsetInElmts);
// The pointer value is taken to be the one with the lowest offset.
@@ -2212,9 +2302,10 @@ namespace {
continue;
Value *IPtr, *JPtr;
unsigned IAlignment, JAlignment;
unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts;
if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
IAddressSpace, JAddressSpace,
OffsetInElmts) || abs64(OffsetInElmts) != 1)
llvm_unreachable("Pre-fusion pointer analysis failed");


@@ -1,8 +1,11 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
; The second check covers the use of alias analysis (with loop unrolling).
; Both checks are run with and without target information.
define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
entry:


@@ -1,5 +1,6 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-TI
; Basic depth-3 chain
define double @test1(double %A1, double %A2, double %B1, double %B2) {
@@ -23,6 +24,9 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
; CHECK-TI: @test1
; CHECK-TI: fsub <2 x double>
; CHECK-TI: ret double
}
; Basic depth-3 chain (last pair permuted)
@@ -146,6 +150,9 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
ret <8 x i8> %R
; CHECK: ret <8 x i8> %R
; CHECK-TI: @test6
; CHECK-TI-NOT: sub <16 x i8>
; CHECK-TI: ret <8 x i8>
}