forked from OSchip/llvm-project
[SLP]Further improvement of the cost model for scalars used in buildvectors.
Further improvement of the cost model for the scalars used in buildvector sequences. The main functionality is outlined into a separate function. The cost is calculated in the following way: 1. If the Base vector is not an undef vector, resize the very first mask to have a common VF and perform the action for 2 input vectors (including the non-undef Base). Other shuffle masks are combined with the result of the 1st stage and processed as a shuffle of 2 vectors. 2. If the Base is an undef vector and there is only 1 shuffle mask, perform the action only for 1 vector with the given mask, if it is not the identity mask. 3. If > 2 masks are used, perform a series of shuffle actions for 2 vectors, combining the masks properly between the steps. The original implementation misses the very first analysis for the Base vector, so the cost might be too optimistic in some cases. But it improves the cost for the insertelements which are part of the current SLP graph. Part of D107966. Differential Revision: https://reviews.llvm.org/D115750
This commit is contained in:
parent
400587ba0c
commit
f5d45d70a5
|
@ -6579,6 +6579,126 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
|
||||||
|
/// buildvector sequence.
|
||||||
|
static bool isFirstInsertElement(const InsertElementInst *IE1,
|
||||||
|
const InsertElementInst *IE2) {
|
||||||
|
const auto *I1 = IE1;
|
||||||
|
const auto *I2 = IE2;
|
||||||
|
const InsertElementInst *PrevI1;
|
||||||
|
const InsertElementInst *PrevI2;
|
||||||
|
do {
|
||||||
|
if (I2 == IE1)
|
||||||
|
return true;
|
||||||
|
if (I1 == IE2)
|
||||||
|
return false;
|
||||||
|
PrevI1 = I1;
|
||||||
|
PrevI2 = I2;
|
||||||
|
if (I1 && (I1 == IE1 || I1->hasOneUse()))
|
||||||
|
I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
|
||||||
|
if (I2 && (I2 == IE2 || I2->hasOneUse()))
|
||||||
|
I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
|
||||||
|
} while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
|
||||||
|
llvm_unreachable("Two different buildvectors not expected.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Does the analysis of the provided shuffle masks and performs the requested
|
||||||
|
/// actions on the vectors with the given shuffle masks. It tries to do it in
|
||||||
|
/// several steps.
|
||||||
|
/// 1. If the Base vector is not undef vector, resizing the very first mask to
|
||||||
|
/// have common VF and perform action for 2 input vectors (including non-undef
|
||||||
|
/// Base). Other shuffle masks are combined with the resulting after the 1 stage
|
||||||
|
/// and processed as a shuffle of 2 elements.
|
||||||
|
/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
|
||||||
|
/// action only for 1 vector with the given mask, if it is not the identity
|
||||||
|
/// mask.
|
||||||
|
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
|
||||||
|
/// vectors, combing the masks properly between the steps.
|
||||||
|
template <typename T>
|
||||||
|
static T *performExtractsShuffleAction(
|
||||||
|
MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
|
||||||
|
function_ref<unsigned(T *)> GetVF,
|
||||||
|
function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
|
||||||
|
function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
|
||||||
|
assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
|
||||||
|
SmallVector<int> Mask(ShuffleMask.begin()->second);
|
||||||
|
auto VMIt = std::next(ShuffleMask.begin());
|
||||||
|
T *Prev = nullptr;
|
||||||
|
bool IsBaseNotUndef = !isUndefVector(Base);
|
||||||
|
if (IsBaseNotUndef) {
|
||||||
|
// Base is not undef, need to combine it with the next subvectors.
|
||||||
|
std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
|
||||||
|
for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
|
||||||
|
if (Mask[Idx] == UndefMaskElem)
|
||||||
|
Mask[Idx] = Idx;
|
||||||
|
else
|
||||||
|
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
|
||||||
|
}
|
||||||
|
Prev = Action(Mask, {nullptr, Res.first});
|
||||||
|
} else if (ShuffleMask.size() == 1) {
|
||||||
|
// Base is undef and only 1 vector is shuffled - perform the action only for
|
||||||
|
// single vector, if the mask is not the identity mask.
|
||||||
|
std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
|
||||||
|
if (Res.second)
|
||||||
|
// Identity mask is found.
|
||||||
|
Prev = Res.first;
|
||||||
|
else
|
||||||
|
Prev = Action(Mask, {ShuffleMask.begin()->first});
|
||||||
|
} else {
|
||||||
|
// Base is undef and at least 2 input vectors shuffled - perform 2 vectors
|
||||||
|
// shuffles step by step, combining shuffle between the steps.
|
||||||
|
unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
|
||||||
|
unsigned Vec2VF = GetVF(VMIt->first);
|
||||||
|
if (Vec1VF == Vec2VF) {
|
||||||
|
// No need to resize the input vectors since they are of the same size, we
|
||||||
|
// can shuffle them directly.
|
||||||
|
ArrayRef<int> SecMask = VMIt->second;
|
||||||
|
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
|
||||||
|
if (SecMask[I] != UndefMaskElem) {
|
||||||
|
assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
|
||||||
|
Mask[I] = SecMask[I] + Vec1VF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
|
||||||
|
} else {
|
||||||
|
// Vectors of different sizes - resize and reshuffle.
|
||||||
|
std::pair<T *, bool> Res1 =
|
||||||
|
ResizeAction(ShuffleMask.begin()->first, Mask);
|
||||||
|
std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
|
||||||
|
ArrayRef<int> SecMask = VMIt->second;
|
||||||
|
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
|
||||||
|
if (Mask[I] != UndefMaskElem) {
|
||||||
|
assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
|
||||||
|
if (Res1.second)
|
||||||
|
Mask[I] = I;
|
||||||
|
} else if (SecMask[I] != UndefMaskElem) {
|
||||||
|
assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
|
||||||
|
Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Prev = Action(Mask, {Res1.first, Res2.first});
|
||||||
|
}
|
||||||
|
VMIt = std::next(VMIt);
|
||||||
|
}
|
||||||
|
// Perform requested actions for the remaining masks/vectors.
|
||||||
|
for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
|
||||||
|
// Shuffle other input vectors, if any.
|
||||||
|
std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
|
||||||
|
ArrayRef<int> SecMask = VMIt->second;
|
||||||
|
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
|
||||||
|
if (SecMask[I] != UndefMaskElem) {
|
||||||
|
assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
|
||||||
|
"Multiple uses of scalars.");
|
||||||
|
Mask[I] = (Res.second ? I : SecMask[I]) + VF;
|
||||||
|
} else if (Mask[I] != UndefMaskElem) {
|
||||||
|
Mask[I] = I;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Prev = Action(Mask, {Prev, Res.first});
|
||||||
|
}
|
||||||
|
return Prev;
|
||||||
|
}
|
||||||
|
|
||||||
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||||
InstructionCost Cost = 0;
|
InstructionCost Cost = 0;
|
||||||
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
|
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
|
||||||
|
@ -6599,9 +6719,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||||
|
|
||||||
SmallPtrSet<Value *, 16> ExtractCostCalculated;
|
SmallPtrSet<Value *, 16> ExtractCostCalculated;
|
||||||
InstructionCost ExtractCost = 0;
|
InstructionCost ExtractCost = 0;
|
||||||
SmallVector<unsigned> VF;
|
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
|
||||||
SmallVector<SmallVector<int>> ShuffleMask;
|
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
|
||||||
SmallVector<Value *> FirstUsers;
|
|
||||||
SmallVector<APInt> DemandedElts;
|
SmallVector<APInt> DemandedElts;
|
||||||
for (ExternalUser &EU : ExternalUses) {
|
for (ExternalUser &EU : ExternalUses) {
|
||||||
// We only add extract cost once for the same scalar.
|
// We only add extract cost once for the same scalar.
|
||||||
|
@ -6630,37 +6749,52 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||||
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
|
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
|
||||||
Optional<unsigned> InsertIdx = getInsertIndex(VU);
|
Optional<unsigned> InsertIdx = getInsertIndex(VU);
|
||||||
if (InsertIdx) {
|
if (InsertIdx) {
|
||||||
auto *It = find_if(FirstUsers, [VU](Value *V) {
|
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
|
||||||
return areTwoInsertFromSameBuildVector(VU,
|
auto *It =
|
||||||
cast<InsertElementInst>(V));
|
find_if(FirstUsers,
|
||||||
|
[VU](const std::pair<Value *, const TreeEntry *> &Pair) {
|
||||||
|
return areTwoInsertFromSameBuildVector(
|
||||||
|
VU, cast<InsertElementInst>(Pair.first));
|
||||||
});
|
});
|
||||||
int VecId = -1;
|
int VecId = -1;
|
||||||
if (It == FirstUsers.end()) {
|
if (It == FirstUsers.end()) {
|
||||||
VF.push_back(FTy->getNumElements());
|
(void)ShuffleMasks.emplace_back();
|
||||||
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
|
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
|
||||||
|
if (Mask.empty())
|
||||||
|
Mask.assign(FTy->getNumElements(), UndefMaskElem);
|
||||||
// Find the insertvector, vectorized in tree, if any.
|
// Find the insertvector, vectorized in tree, if any.
|
||||||
Value *Base = VU;
|
Value *Base = VU;
|
||||||
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
|
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
|
||||||
|
if (IEBase != EU.User && !IEBase->hasOneUse())
|
||||||
|
break;
|
||||||
// Build the mask for the vectorized insertelement instructions.
|
// Build the mask for the vectorized insertelement instructions.
|
||||||
if (const TreeEntry *E = getTreeEntry(IEBase)) {
|
if (const TreeEntry *E = getTreeEntry(IEBase)) {
|
||||||
VU = IEBase;
|
VU = IEBase;
|
||||||
do {
|
do {
|
||||||
int Idx = E->findLaneForValue(Base);
|
IEBase = cast<InsertElementInst>(Base);
|
||||||
ShuffleMask.back()[Idx] = Idx;
|
int Idx = *getInsertIndex(IEBase);
|
||||||
Base = cast<InsertElementInst>(Base)->getOperand(0);
|
assert(Mask[Idx] == UndefMaskElem &&
|
||||||
|
"InsertElementInstruction used already.");
|
||||||
|
Mask[Idx] = Idx;
|
||||||
|
Base = IEBase->getOperand(0);
|
||||||
} while (E == getTreeEntry(Base));
|
} while (E == getTreeEntry(Base));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
Base = cast<InsertElementInst>(Base)->getOperand(0);
|
Base = cast<InsertElementInst>(Base)->getOperand(0);
|
||||||
}
|
}
|
||||||
FirstUsers.push_back(VU);
|
FirstUsers.emplace_back(VU, ScalarTE);
|
||||||
DemandedElts.push_back(APInt::getZero(VF.back()));
|
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
|
||||||
VecId = FirstUsers.size() - 1;
|
VecId = FirstUsers.size() - 1;
|
||||||
} else {
|
} else {
|
||||||
|
if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
|
||||||
|
It->first = VU;
|
||||||
VecId = std::distance(FirstUsers.begin(), It);
|
VecId = std::distance(FirstUsers.begin(), It);
|
||||||
}
|
}
|
||||||
int InIdx = *InsertIdx;
|
int InIdx = *InsertIdx;
|
||||||
ShuffleMask[VecId][InIdx] = EU.Lane;
|
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
|
||||||
|
if (Mask.empty())
|
||||||
|
Mask.assign(FTy->getNumElements(), UndefMaskElem);
|
||||||
|
Mask[InIdx] = EU.Lane;
|
||||||
DemandedElts[VecId].setBit(InIdx);
|
DemandedElts[VecId].setBit(InIdx);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -6687,90 +6821,76 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||||
|
|
||||||
InstructionCost SpillCost = getSpillCost();
|
InstructionCost SpillCost = getSpillCost();
|
||||||
Cost += SpillCost + ExtractCost;
|
Cost += SpillCost + ExtractCost;
|
||||||
if (FirstUsers.size() == 1) {
|
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
|
||||||
int Limit = ShuffleMask.front().size() * 2;
|
InstructionCost C = 0;
|
||||||
if (!all_of(ShuffleMask.front(),
|
unsigned VF = Mask.size();
|
||||||
[Limit](int Idx) { return Idx < Limit; }) ||
|
unsigned VecVF = TE->getVectorFactor();
|
||||||
!ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
|
if (VF != VecVF &&
|
||||||
InstructionCost C = TTI->getShuffleCost(
|
(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
|
||||||
|
(all_of(Mask,
|
||||||
|
[VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
|
||||||
|
!ShuffleVectorInst::isIdentityMask(Mask)))) {
|
||||||
|
SmallVector<int> OrigMask(VecVF, UndefMaskElem);
|
||||||
|
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
|
||||||
|
OrigMask.begin());
|
||||||
|
C = TTI->getShuffleCost(
|
||||||
TTI::SK_PermuteSingleSrc,
|
TTI::SK_PermuteSingleSrc,
|
||||||
cast<FixedVectorType>(FirstUsers.front()->getType()),
|
FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
|
||||||
ShuffleMask.front());
|
LLVM_DEBUG(
|
||||||
|
dbgs() << "SLP: Adding cost " << C
|
||||||
|
<< " for final shuffle of insertelement external users.\n";
|
||||||
|
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
|
||||||
|
Cost += C;
|
||||||
|
return std::make_pair(TE, true);
|
||||||
|
}
|
||||||
|
return std::make_pair(TE, false);
|
||||||
|
};
|
||||||
|
// Calculate the cost of the reshuffled vectors, if any.
|
||||||
|
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
|
||||||
|
Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
|
||||||
|
unsigned VF = ShuffleMasks[I].begin()->second.size();
|
||||||
|
auto *FTy = FixedVectorType::get(
|
||||||
|
cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
|
||||||
|
auto Vector = ShuffleMasks[I].takeVector();
|
||||||
|
auto &&EstimateShufflesCost = [this, FTy,
|
||||||
|
&Cost](ArrayRef<int> Mask,
|
||||||
|
ArrayRef<const TreeEntry *> TEs) {
|
||||||
|
assert((TEs.size() == 1 || TEs.size() == 2) &&
|
||||||
|
"Expected exactly 1 or 2 tree entries.");
|
||||||
|
if (TEs.size() == 1) {
|
||||||
|
int Limit = 2 * Mask.size();
|
||||||
|
if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
|
||||||
|
!ShuffleVectorInst::isIdentityMask(Mask)) {
|
||||||
|
InstructionCost C =
|
||||||
|
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
|
||||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
||||||
<< " for final shuffle of insertelement external users "
|
<< " for final shuffle of insertelement "
|
||||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
"external users.\n";
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
TEs.front()->dump();
|
||||||
|
dbgs() << "SLP: Current total cost = " << Cost << "\n");
|
||||||
Cost += C;
|
Cost += C;
|
||||||
}
|
}
|
||||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
} else {
|
||||||
cast<FixedVectorType>(FirstUsers.front()->getType()),
|
|
||||||
DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
|
|
||||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
|
||||||
<< " for insertelements gather.\n"
|
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
|
||||||
Cost -= InsertCost;
|
|
||||||
} else if (FirstUsers.size() >= 2) {
|
|
||||||
unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
|
|
||||||
// Combined masks of the first 2 vectors.
|
|
||||||
SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
|
|
||||||
copy(ShuffleMask.front(), CombinedMask.begin());
|
|
||||||
APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
|
|
||||||
auto *VecTy = FixedVectorType::get(
|
|
||||||
cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
|
|
||||||
MaxVF);
|
|
||||||
for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
|
|
||||||
if (ShuffleMask[1][I] != UndefMaskElem) {
|
|
||||||
CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
|
|
||||||
CombinedDemandedElts.setBit(I);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
InstructionCost C =
|
InstructionCost C =
|
||||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
|
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
|
||||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
||||||
<< " for final shuffle of vector node and external "
|
<< " for final shuffle of vector node and external "
|
||||||
"insertelement users "
|
"insertelement users.\n";
|
||||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
dbgs() << "SLP: Current total cost = " << Cost << "\n");
|
||||||
Cost += C;
|
Cost += C;
|
||||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
|
||||||
VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
|
|
||||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
|
||||||
<< " for insertelements gather.\n"
|
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
|
||||||
Cost -= InsertCost;
|
|
||||||
for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
|
|
||||||
if (ShuffleMask[I].empty())
|
|
||||||
continue;
|
|
||||||
// Other elements - permutation of 2 vectors (the initial one and the
|
|
||||||
// next Ith incoming vector).
|
|
||||||
unsigned VF = ShuffleMask[I].size();
|
|
||||||
for (unsigned Idx = 0; Idx < VF; ++Idx) {
|
|
||||||
int Mask = ShuffleMask[I][Idx];
|
|
||||||
if (Mask != UndefMaskElem)
|
|
||||||
CombinedMask[Idx] = MaxVF + Mask;
|
|
||||||
else if (CombinedMask[Idx] != UndefMaskElem)
|
|
||||||
CombinedMask[Idx] = Idx;
|
|
||||||
}
|
}
|
||||||
for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
|
return TEs.back();
|
||||||
if (CombinedMask[Idx] != UndefMaskElem)
|
};
|
||||||
CombinedMask[Idx] = Idx;
|
(void)performExtractsShuffleAction<const TreeEntry>(
|
||||||
InstructionCost C =
|
makeMutableArrayRef(Vector.data(), Vector.size()), Base,
|
||||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
|
[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
|
||||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
EstimateShufflesCost);
|
||||||
<< " for final shuffle of vector node and external "
|
|
||||||
"insertelement users "
|
|
||||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
|
||||||
Cost += C;
|
|
||||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
||||||
cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
|
cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
|
||||||
/*Insert*/ true, /*Extract*/ false);
|
/*Insert*/ true, /*Extract*/ false);
|
||||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
|
||||||
<< " for insertelements gather.\n"
|
|
||||||
<< "SLP: Current total cost = " << Cost << "\n");
|
|
||||||
Cost -= InsertCost;
|
Cost -= InsertCost;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
SmallString<256> Str;
|
SmallString<256> Str;
|
||||||
|
@ -10376,8 +10496,9 @@ public:
|
||||||
for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
|
for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
|
||||||
if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
|
if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
|
||||||
continue;
|
continue;
|
||||||
if (VectorizedVals.count(Candidates[Cnt]))
|
unsigned NumOps = VectorizedVals.lookup(Candidates[Cnt]) +
|
||||||
continue;
|
std::count(VL.begin(), VL.end(), Candidates[Cnt]);
|
||||||
|
if (NumOps != ReducedValsToOps.find(Candidates[Cnt])->second.size())
|
||||||
LocalExternallyUsedValues[Candidates[Cnt]];
|
LocalExternallyUsedValues[Candidates[Cnt]];
|
||||||
}
|
}
|
||||||
V.buildExternalUses(LocalExternallyUsedValues);
|
V.buildExternalUses(LocalExternallyUsedValues);
|
||||||
|
@ -10480,10 +10601,7 @@ public:
|
||||||
auto TVIt = TrackedVals.find(RdxVal);
|
auto TVIt = TrackedVals.find(RdxVal);
|
||||||
if (TVIt != TrackedVals.end())
|
if (TVIt != TrackedVals.end())
|
||||||
StableRdxVal = TVIt->second;
|
StableRdxVal = TVIt->second;
|
||||||
unsigned NumOps = 0;
|
unsigned NumOps = VectorizedVals.lookup(RdxVal);
|
||||||
auto It = VectorizedVals.find(RdxVal);
|
|
||||||
if (It != VectorizedVals.end())
|
|
||||||
NumOps = It->second;
|
|
||||||
for (Instruction *RedOp :
|
for (Instruction *RedOp :
|
||||||
makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
|
makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
|
||||||
.drop_back(NumOps)) {
|
.drop_back(NumOps)) {
|
||||||
|
@ -10550,8 +10668,6 @@ public:
|
||||||
return VectorizedTree;
|
return VectorizedTree;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned numReductionValues() const { return ReducedVals.size(); }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// Calculate the cost of a reduction.
|
/// Calculate the cost of a reduction.
|
||||||
InstructionCost getReductionCost(TargetTransformInfo *TTI,
|
InstructionCost getReductionCost(TargetTransformInfo *TTI,
|
||||||
|
|
|
@ -11,25 +11,27 @@ define void @_foo(double %p1, double %p2, double %p3) #0 {
|
||||||
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
|
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
|
||||||
; CHECK-NEXT: br label [[BB1:%.*]]
|
; CHECK-NEXT: br label [[BB1:%.*]]
|
||||||
; CHECK: bb1:
|
; CHECK: bb1:
|
||||||
; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
|
|
||||||
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
|
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
|
||||||
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
|
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
|
||||||
; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0
|
||||||
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1
|
||||||
|
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 1.638400e+04, double 1.638400e+04>
|
||||||
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[ADD]], i32 1
|
||||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||||
; CHECK: for.body:
|
; CHECK: for.body:
|
||||||
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
|
||||||
; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
|
||||||
; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
|
||||||
; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[T_0259]], i32 0
|
; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
|
||||||
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
|
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
|
||||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
|
||||||
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
|
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
|
||||||
; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[P3_ADDR_0258]], i32 0
|
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
|
||||||
|
; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i32 0
|
||||||
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
|
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
|
||||||
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
|
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
|
||||||
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
|
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
|
||||||
; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
|
; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]]
|
||||||
; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
|
|
||||||
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
|
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
|
||||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
|
||||||
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
|
||||||
|
|
|
@ -11,25 +11,27 @@ define void @_foo(double %p1, double %p2, double %p3) #0 {
|
||||||
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
|
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
|
||||||
; CHECK-NEXT: br label [[BB1:%.*]]
|
; CHECK-NEXT: br label [[BB1:%.*]]
|
||||||
; CHECK: bb1:
|
; CHECK: bb1:
|
||||||
; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
|
|
||||||
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
|
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
|
||||||
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
|
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
|
||||||
; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0
|
||||||
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1
|
||||||
|
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 1.638400e+04, double 1.638400e+04>
|
||||||
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[ADD]], i32 1
|
||||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||||
; CHECK: for.body:
|
; CHECK: for.body:
|
||||||
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
|
||||||
; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
|
||||||
; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
|
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
|
||||||
; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0
|
; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
|
||||||
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
|
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
|
||||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
|
||||||
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
|
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
|
||||||
; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0
|
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
|
||||||
|
; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[TMP6]], i32 0
|
||||||
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
|
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
|
||||||
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
|
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
|
||||||
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
|
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
|
||||||
; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
|
; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]]
|
||||||
; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
|
|
||||||
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
|
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
|
||||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
|
||||||
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
|
||||||
|
|
|
@ -6,28 +6,26 @@ define void @test() {
|
||||||
; CHECK-NEXT: entry:
|
; CHECK-NEXT: entry:
|
||||||
; CHECK-NEXT: br label [[BODY:%.*]]
|
; CHECK-NEXT: br label [[BODY:%.*]]
|
||||||
; CHECK: body:
|
; CHECK: body:
|
||||||
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[BODY]] ]
|
; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
|
; CHECK-NEXT: [[PHI2:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[BODY]] ]
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP1]], i32 1
|
; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
|
; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
|
; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
|
|
||||||
; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[TMP5]], [[TMP4]]
|
|
||||||
; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
|
; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
|
||||||
; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]]
|
; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]]
|
||||||
; CHECK: exit:
|
; CHECK: exit:
|
||||||
; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
|
; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
|
||||||
; CHECK: if.then135.i:
|
; CHECK: if.then135.i:
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
|
; CHECK-NEXT: [[CMP145_I:%.*]] = fcmp fast olt double [[PHI1]], 0.000000e+00
|
||||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
|
; CHECK-NEXT: [[CMP152_I:%.*]] = fcmp fast olt double [[PHI2]], 0.000000e+00
|
||||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i1> <i1 poison, i1 false>, i1 [[TMP7]], i32 0
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i1> <i1 poison, i1 false>, i1 [[CMP152_I]], i32 0
|
||||||
; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
|
; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[TMP0]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
|
||||||
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP9]]
|
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP1]]
|
||||||
; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP10]], zeroinitializer
|
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
|
||||||
; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], zeroinitializer
|
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], zeroinitializer
|
||||||
; CHECK-NEXT: br label [[IF_END209_I]]
|
; CHECK-NEXT: br label [[IF_END209_I]]
|
||||||
; CHECK: if.end209.i:
|
; CHECK: if.end209.i:
|
||||||
; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
|
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ [[TMP4]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
|
||||||
; CHECK-NEXT: ret void
|
; CHECK-NEXT: ret void
|
||||||
;
|
;
|
||||||
entry:
|
entry:
|
||||||
|
|
Loading…
Reference in New Issue