forked from OSchip/llvm-project
SLPVectorizer: Add support for trees with external users.
For example: bar() { int a = A[i]; int b = A[i+1]; B[i] = a; B[i+1] = b; foo(a); <--- a is used outside the vectorized expression. } llvm-svn: 181648
This commit is contained in:
parent
0686e5cb05
commit
cdfb48d2fe
|
@ -243,6 +243,10 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
|
|||
LaneMap.clear();
|
||||
MultiUserVals.clear();
|
||||
MustScalarize.clear();
|
||||
MustExtract.clear();
|
||||
|
||||
// Find the location of the last root.
|
||||
unsigned LastRootIndex = InstrIdx[GetLastInstr(VL, VL.size())];
|
||||
|
||||
// Scan the tree and find which value is used by which lane, and which values
|
||||
// must be scalarized.
|
||||
|
@ -258,15 +262,31 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
|
|||
for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
|
||||
I != E; ++I) {
|
||||
if (LaneMap.find(*I) == LaneMap.end()) {
|
||||
MustScalarize.insert(*it);
|
||||
DEBUG(dbgs()<<"SLP: Adding " << **it <<
|
||||
" to MustScalarize because of an out of tree usage.\n");
|
||||
break;
|
||||
DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n");
|
||||
|
||||
// We don't have an ordering problem if the user is not in this basic
|
||||
// block.
|
||||
Instruction *Inst = cast<Instruction>(*I);
|
||||
if (Inst->getParent() == BB) {
|
||||
// We don't have an ordering problem if the user is after the last
|
||||
// root.
|
||||
unsigned Idx = InstrIdx[Inst];
|
||||
if (Idx < LastRootIndex) {
|
||||
MustScalarize.insert(*it);
|
||||
DEBUG(dbgs()<<"SLP: Adding to MustScalarize "
|
||||
"because of an unsafe out of tree usage.\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(dbgs()<<"SLP: Adding to MustExtract "
|
||||
"because of a safe out of tree usage.\n");
|
||||
MustExtract.insert(*it);
|
||||
}
|
||||
if (Lane == -1) Lane = LaneMap[*I];
|
||||
if (Lane != LaneMap[*I]) {
|
||||
MustScalarize.insert(*it);
|
||||
DEBUG(dbgs()<<"Adding " << **it <<
|
||||
DEBUG(dbgs()<<"SLP: Adding " << **it <<
|
||||
" to MustScalarize because multiple lane use it: "
|
||||
<< Lane << " and " << LaneMap[*I] << ".\n");
|
||||
break;
|
||||
|
@ -456,6 +476,13 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
}
|
||||
}
|
||||
|
||||
// Calculate the extract cost.
|
||||
unsigned ExternalUserExtractCost = 0;
|
||||
for (unsigned i = 0, e = VL.size(); i < e; ++i)
|
||||
if (MustExtract.count(VL[i]))
|
||||
ExternalUserExtractCost +=
|
||||
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
|
||||
|
||||
switch (Opcode) {
|
||||
case Instruction::ZExt:
|
||||
case Instruction::SExt:
|
||||
|
@ -469,7 +496,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
case Instruction::Trunc:
|
||||
case Instruction::FPTrunc:
|
||||
case Instruction::BitCast: {
|
||||
int Cost = 0;
|
||||
int Cost = ExternalUserExtractCost;
|
||||
ValueList Operands;
|
||||
Type *SrcTy = VL0->getOperand(0)->getType();
|
||||
// Prepare the operand vector.
|
||||
|
@ -510,7 +537,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor: {
|
||||
int Cost = 0;
|
||||
int Cost = ExternalUserExtractCost;
|
||||
// Calculate the cost of all of the operands.
|
||||
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
||||
ValueList Operands;
|
||||
|
@ -540,7 +567,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
int ScalarLdCost = VecTy->getNumElements() *
|
||||
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
|
||||
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
|
||||
return VecLdCost - ScalarLdCost;
|
||||
return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
|
||||
}
|
||||
case Instruction::Store: {
|
||||
// We know that we can merge the stores. Calculate the cost.
|
||||
|
@ -556,7 +583,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
}
|
||||
|
||||
int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
|
||||
return TotalCost;
|
||||
return TotalCost + ExternalUserExtractCost;
|
||||
}
|
||||
default:
|
||||
// Unable to vectorize unknown instructions.
|
||||
|
@ -588,10 +615,24 @@ Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
|
|||
|
||||
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
|
||||
Value *V = vectorizeTree_rec(VL, VF);
|
||||
|
||||
Instruction *LastInstr = GetLastInstr(VL, VL.size());
|
||||
IRBuilder<> Builder(LastInstr);
|
||||
for (ValueSet::iterator it = MustExtract.begin(), e = MustExtract.end();
|
||||
it != e; ++it) {
|
||||
Instruction *I = cast<Instruction>(*it);
|
||||
Value *Vec = VectorizedValues[I];
|
||||
assert(LaneMap.count(I) && "Unable to find the lane for the external use");
|
||||
Value *Idx = Builder.getInt32(LaneMap[I]);
|
||||
Value *Extract = Builder.CreateExtractElement(Vec, Idx);
|
||||
I->replaceAllUsesWith(Extract);
|
||||
}
|
||||
|
||||
// We moved some instructions around. We have to number them again
|
||||
// before we can do any analysis.
|
||||
numberInstructions();
|
||||
MustScalarize.clear();
|
||||
MustExtract.clear();
|
||||
return V;
|
||||
}
|
||||
|
||||
|
|
|
@ -127,6 +127,11 @@ private:
|
|||
/// NOTICE: The vectorization methods also use this set.
|
||||
ValueSet MustScalarize;
|
||||
|
||||
/// Contains values that have users outside of the vectorized graph.
|
||||
/// We need to generate extract instructions for these values.
|
||||
/// NOTICE: The vectorization methods also use this set.
|
||||
ValueSet MustExtract;
|
||||
|
||||
/// Contains a list of values that are used outside the current tree. This
|
||||
/// set must be reset between runs.
|
||||
ValueSet MultiUserVals;
|
||||
|
|
|
@ -41,7 +41,7 @@ entry:
|
|||
}
|
||||
|
||||
|
||||
; int foo_fail(int * restrict B, int * restrict A, int n, int m) {
|
||||
; int extr_user(int * restrict B, int * restrict A, int n, int m) {
|
||||
; B[0] = n * A[0] + m * A[0];
|
||||
; B[1] = n * A[1] + m * A[1];
|
||||
; B[2] = n * A[2] + m * A[2];
|
||||
|
@ -49,10 +49,11 @@ entry:
|
|||
; return A[0];
|
||||
; }
|
||||
|
||||
; CHECK: @foo_fail
|
||||
; CHECK-NOT: load <4 x i32>
|
||||
; CHECK: @extr_user
|
||||
; CHECK: store <4 x i32>
|
||||
; CHECK-NEXT: extractelement <4 x i32>
|
||||
; CHECK: ret
|
||||
define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
|
||||
define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
|
||||
entry:
|
||||
%0 = load i32* %A, align 4
|
||||
%mul238 = add i32 %m, %n
|
||||
|
|
Loading…
Reference in New Issue