forked from OSchip/llvm-project
[SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle.
Summary: If the same value is going to be vectorized several times in the same tree entry, this entry is considered to be a gather entry and the cost of this gather is counted as the cost of InsertElementInstrs for each gathered value. But we can consider these elements as a ShuffleInstr with the SK_PermuteSingleSrc shuffle kind. Reviewers: spatel, RKSimon, mkuper, hfinkel Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D38697 llvm-svn: 323348
This commit is contained in:
parent
abdcc613b6
commit
4bd8e5332f
|
@ -662,13 +662,9 @@ private:
|
|||
/// Vectorize a single entry in the tree, starting in \p VL.
|
||||
Value *vectorizeTree(ArrayRef<Value *> VL);
|
||||
|
||||
/// \returns the pointer to the vectorized value if \p VL is already
|
||||
/// vectorized, or NULL. They may happen in cycles.
|
||||
Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
|
||||
|
||||
/// \returns the scalarization cost for this type. Scalarization in this
|
||||
/// context means the creation of vectors from a group of scalars.
|
||||
int getGatherCost(Type *Ty);
|
||||
int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
|
||||
|
||||
/// \returns the scalarization cost for this list of values. Assuming that
|
||||
/// this subtree gets vectorized, we may need to extract the values from the
|
||||
|
@ -702,8 +698,12 @@ private:
|
|||
|
||||
/// \returns true if the scalars in VL are equal to this entry.
|
||||
bool isSame(ArrayRef<Value *> VL) const {
|
||||
assert(VL.size() == Scalars.size() && "Invalid size");
|
||||
return std::equal(VL.begin(), VL.end(), Scalars.begin());
|
||||
if (VL.size() == Scalars.size())
|
||||
return std::equal(VL.begin(), VL.end(), Scalars.begin());
|
||||
assert(VL.size() == ReuseShuffleIndices.size() && "Invalid size");
|
||||
return std::equal(
|
||||
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
|
||||
[this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
|
||||
}
|
||||
|
||||
/// A vector of scalars.
|
||||
|
@ -715,6 +715,9 @@ private:
|
|||
/// Do we need to gather this sequence ?
|
||||
bool NeedToGather = false;
|
||||
|
||||
/// Does this sequence require some shuffling?
|
||||
SmallVector<unsigned, 4> ReuseShuffleIndices;
|
||||
|
||||
/// Points back to the VectorizableTree.
|
||||
///
|
||||
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
|
||||
|
@ -729,13 +732,15 @@ private:
|
|||
};
|
||||
|
||||
/// Create a new VectorizableTree entry.
|
||||
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
|
||||
int &UserTreeIdx) {
|
||||
void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
|
||||
ArrayRef<unsigned> ReuseShuffleIndices = None) {
|
||||
VectorizableTree.emplace_back(VectorizableTree);
|
||||
int idx = VectorizableTree.size() - 1;
|
||||
TreeEntry *Last = &VectorizableTree[idx];
|
||||
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
|
||||
Last->NeedToGather = !Vectorized;
|
||||
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
|
||||
ReuseShuffleIndices.end());
|
||||
if (Vectorized) {
|
||||
for (int i = 0, e = VL.size(); i != e; ++i) {
|
||||
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
|
||||
|
@ -748,7 +753,6 @@ private:
|
|||
if (UserTreeIdx >= 0)
|
||||
Last->UserTreeIndices.push_back(UserTreeIdx);
|
||||
UserTreeIdx = idx;
|
||||
return Last;
|
||||
}
|
||||
|
||||
/// -- Vectorization State --
|
||||
|
@ -762,13 +766,6 @@ private:
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const TreeEntry *getTreeEntry(Value *V) const {
|
||||
auto I = ScalarToTreeEntry.find(V);
|
||||
if (I != ScalarToTreeEntry.end())
|
||||
return &VectorizableTree[I->second];
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Maps a specific scalar to its tree entry.
|
||||
SmallDenseMap<Value*, int> ScalarToTreeEntry;
|
||||
|
||||
|
@ -1484,13 +1481,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
}
|
||||
|
||||
// Check that every instruction appears once in this bundle.
|
||||
for (unsigned i = 0, e = VL.size(); i < e; ++i)
|
||||
for (unsigned j = i + 1; j < e; ++j)
|
||||
if (VL[i] == VL[j]) {
|
||||
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
return;
|
||||
}
|
||||
SmallVector<unsigned, 4> ReuseShuffleIndicies;
|
||||
SmallVector<Value *, 4> UniqueValues;
|
||||
DenseMap<Value *, unsigned> UniquePositions;
|
||||
for (Value *V : VL) {
|
||||
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
|
||||
ReuseShuffleIndicies.emplace_back(Res.first->second);
|
||||
if (Res.second)
|
||||
UniqueValues.emplace_back(V);
|
||||
}
|
||||
if (UniqueValues.size() == VL.size()) {
|
||||
ReuseShuffleIndicies.clear();
|
||||
} else {
|
||||
DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
|
||||
if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
|
||||
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
return;
|
||||
}
|
||||
VL = UniqueValues;
|
||||
}
|
||||
|
||||
auto &BSRef = BlocksSchedules[BB];
|
||||
if (!BSRef)
|
||||
|
@ -1498,12 +1508,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
|
||||
BlockScheduling &BS = *BSRef.get();
|
||||
|
||||
if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
|
||||
if (!BS.tryScheduleBundle(VL, this, VL0)) {
|
||||
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
|
||||
assert((!BS.getScheduleData(VL0) ||
|
||||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
|
||||
"tryScheduleBundle should cancelScheduling on failure");
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
|
||||
|
@ -1522,12 +1532,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
if (Term) {
|
||||
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
|
||||
|
||||
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
|
||||
|
@ -1545,7 +1555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
case Instruction::ExtractElement: {
|
||||
bool Reuse = canReuseExtract(VL, VL0);
|
||||
if (Reuse) {
|
||||
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
|
||||
DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
|
||||
++NumOpsWantToKeepOrder[S.Opcode];
|
||||
} else {
|
||||
SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
|
||||
|
@ -1553,7 +1563,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
--NumOpsWantToKeepOrder[S.Opcode];
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
}
|
||||
newTreeEntry(VL, Reuse, UserTreeIdx);
|
||||
newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
case Instruction::Load: {
|
||||
|
@ -1568,7 +1578,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
if (DL->getTypeSizeInBits(ScalarTy) !=
|
||||
DL->getTypeAllocSizeInBits(ScalarTy)) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1579,7 +1589,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
LoadInst *L = cast<LoadInst>(VL[i]);
|
||||
if (!L->isSimple()) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1601,7 +1611,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
|
||||
if (Consecutive) {
|
||||
++NumOpsWantToKeepOrder[S.Opcode];
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1616,7 +1626,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
}
|
||||
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
|
||||
if (ReverseConsecutive) {
|
||||
--NumOpsWantToKeepOrder[S.Opcode];
|
||||
|
@ -1643,12 +1653,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
|
||||
if (Ty != SrcTy || !isValidElementType(Ty)) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
|
||||
|
||||
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
||||
|
@ -1671,13 +1681,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
if (Cmp->getPredicate() != P0 ||
|
||||
Cmp->getOperand(0)->getType() != ComparedTy) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
|
||||
|
||||
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
||||
|
@ -1709,7 +1719,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor:
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
|
||||
|
||||
// Sort operands of the instructions so that each side is more likely to
|
||||
|
@ -1738,7 +1748,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
|
||||
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1751,7 +1761,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
if (Ty0 != CurTy) {
|
||||
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1763,12 +1773,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
DEBUG(
|
||||
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
|
||||
for (unsigned i = 0, e = 2; i < e; ++i) {
|
||||
ValueList Operands;
|
||||
|
@ -1785,12 +1795,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
|
||||
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
|
||||
|
||||
ValueList Operands;
|
||||
|
@ -1808,7 +1818,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
|
||||
if (!isTriviallyVectorizable(ID)) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1822,7 +1832,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
|
||||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
|
||||
<< "\n");
|
||||
return;
|
||||
|
@ -1833,7 +1843,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
Value *A1J = CI2->getArgOperand(1);
|
||||
if (A1I != A1J) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
|
||||
<< " argument "<< A1I<<"!=" << A1J
|
||||
<< "\n");
|
||||
|
@ -1846,14 +1856,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
CI->op_begin() + CI->getBundleOperandsEndIndex(),
|
||||
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
|
||||
<< *VL[i] << '\n');
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
|
||||
ValueList Operands;
|
||||
// Prepare the operand vector.
|
||||
|
@ -1870,11 +1880,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
// then do not vectorize this instruction.
|
||||
if (!S.IsAltShuffle) {
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
|
||||
return;
|
||||
}
|
||||
newTreeEntry(VL, true, UserTreeIdx);
|
||||
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
|
||||
|
||||
// Reorder operands if reordering would enable vectorization.
|
||||
|
@ -1898,7 +1908,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
|
||||
default:
|
||||
BS.cancelScheduling(VL, VL0);
|
||||
newTreeEntry(VL, false, UserTreeIdx);
|
||||
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
|
||||
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1991,13 +2001,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
VecTy = VectorType::get(
|
||||
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
|
||||
|
||||
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
|
||||
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
|
||||
int ReuseShuffleCost = 0;
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost =
|
||||
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
|
||||
}
|
||||
if (E->NeedToGather) {
|
||||
if (allConstant(VL))
|
||||
return 0;
|
||||
if (isSplat(VL)) {
|
||||
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
|
||||
return ReuseShuffleCost +
|
||||
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
|
||||
}
|
||||
if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
|
||||
if (getSameOpcode(VL).Opcode == Instruction::ExtractElement &&
|
||||
allSameType(VL) && allSameBlock(VL)) {
|
||||
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
|
||||
if (ShuffleKind.hasValue()) {
|
||||
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
|
||||
|
@ -2014,10 +2033,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
IO->getZExtValue());
|
||||
}
|
||||
}
|
||||
return Cost;
|
||||
return ReuseShuffleCost + Cost;
|
||||
}
|
||||
}
|
||||
return getGatherCost(E->Scalars);
|
||||
return ReuseShuffleCost + getGatherCost(VL);
|
||||
}
|
||||
InstructionsState S = getSameOpcode(VL);
|
||||
assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
|
||||
|
@ -2030,8 +2049,36 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
|
||||
case Instruction::ExtractValue:
|
||||
case Instruction::ExtractElement:
|
||||
if (NeedToShuffleReuses) {
|
||||
unsigned Idx = 0;
|
||||
for (unsigned I : E->ReuseShuffleIndices) {
|
||||
if (ShuffleOrOp == Instruction::ExtractElement) {
|
||||
auto *IO = cast<ConstantInt>(
|
||||
cast<ExtractElementInst>(VL[I])->getIndexOperand());
|
||||
Idx = IO->getZExtValue();
|
||||
ReuseShuffleCost -= TTI->getVectorInstrCost(
|
||||
Instruction::ExtractElement, VecTy, Idx);
|
||||
} else {
|
||||
ReuseShuffleCost -= TTI->getVectorInstrCost(
|
||||
Instruction::ExtractElement, VecTy, Idx);
|
||||
++Idx;
|
||||
}
|
||||
}
|
||||
Idx = ReuseShuffleNumbers;
|
||||
for (Value *V : VL) {
|
||||
if (ShuffleOrOp == Instruction::ExtractElement) {
|
||||
auto *IO = cast<ConstantInt>(
|
||||
cast<ExtractElementInst>(V)->getIndexOperand());
|
||||
Idx = IO->getZExtValue();
|
||||
} else {
|
||||
--Idx;
|
||||
}
|
||||
ReuseShuffleCost +=
|
||||
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
|
||||
}
|
||||
}
|
||||
if (canReuseExtract(VL, S.OpValue)) {
|
||||
int DeadCost = 0;
|
||||
int DeadCost = ReuseShuffleCost;
|
||||
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
|
||||
Instruction *E = cast<Instruction>(VL[i]);
|
||||
// If all users are going to be vectorized, instruction can be
|
||||
|
@ -2039,12 +2086,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
// The same, if have only one user, it will be vectorized for sure.
|
||||
if (areAllUsersVectorized(E))
|
||||
// Take credit for instruction that will become dead.
|
||||
DeadCost +=
|
||||
DeadCost -=
|
||||
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
|
||||
}
|
||||
return -DeadCost;
|
||||
return DeadCost;
|
||||
}
|
||||
return getGatherCost(VecTy);
|
||||
return ReuseShuffleCost + getGatherCost(VL);
|
||||
|
||||
case Instruction::ZExt:
|
||||
case Instruction::SExt:
|
||||
|
@ -2059,6 +2106,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
case Instruction::FPTrunc:
|
||||
case Instruction::BitCast: {
|
||||
Type *SrcTy = VL0->getOperand(0)->getType();
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -=
|
||||
(ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0);
|
||||
}
|
||||
|
||||
// Calculate the cost of this instruction.
|
||||
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
|
||||
|
@ -2067,19 +2119,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
|
||||
int VecCost = 0;
|
||||
// Check if the values are candidates to demote.
|
||||
if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
|
||||
VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
|
||||
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
|
||||
VecCost = ReuseShuffleCost +
|
||||
TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
|
||||
}
|
||||
return VecCost - ScalarCost;
|
||||
}
|
||||
case Instruction::FCmp:
|
||||
case Instruction::ICmp:
|
||||
case Instruction::Select: {
|
||||
// Calculate the cost of this instruction.
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
|
||||
Builder.getInt1Ty(), VL0);
|
||||
}
|
||||
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
|
||||
int ScalarCost = VecTy->getNumElements() *
|
||||
TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
|
||||
int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
|
||||
return VecCost - ScalarCost;
|
||||
return ReuseShuffleCost + VecCost - ScalarCost;
|
||||
}
|
||||
case Instruction::Add:
|
||||
case Instruction::FAdd:
|
||||
|
@ -2137,13 +2196,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
Op2VP = TargetTransformInfo::OP_PowerOf2;
|
||||
|
||||
SmallVector<const Value *, 4> Operands(VL0->operand_values());
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -=
|
||||
(ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
|
||||
Op2VP, Operands);
|
||||
}
|
||||
int ScalarCost =
|
||||
VecTy->getNumElements() *
|
||||
TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
|
||||
Op2VP, Operands);
|
||||
int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
|
||||
Op1VP, Op2VP, Operands);
|
||||
return VecCost - ScalarCost;
|
||||
return ReuseShuffleCost + VecCost - ScalarCost;
|
||||
}
|
||||
case Instruction::GetElementPtr: {
|
||||
TargetTransformInfo::OperandValueKind Op1VK =
|
||||
|
@ -2151,31 +2216,46 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
TargetTransformInfo::OperandValueKind Op2VK =
|
||||
TargetTransformInfo::OK_UniformConstantValue;
|
||||
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getArithmeticInstrCost(Instruction::Add,
|
||||
ScalarTy, Op1VK, Op2VK);
|
||||
}
|
||||
int ScalarCost =
|
||||
VecTy->getNumElements() *
|
||||
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
|
||||
int VecCost =
|
||||
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
|
||||
|
||||
return VecCost - ScalarCost;
|
||||
return ReuseShuffleCost + VecCost - ScalarCost;
|
||||
}
|
||||
case Instruction::Load: {
|
||||
// Cost of wide load - cost of scalar loads.
|
||||
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
|
||||
alignment, 0, VL0);
|
||||
}
|
||||
int ScalarLdCost = VecTy->getNumElements() *
|
||||
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
|
||||
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
|
||||
VecTy, alignment, 0, VL0);
|
||||
return VecLdCost - ScalarLdCost;
|
||||
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
|
||||
}
|
||||
case Instruction::Store: {
|
||||
// We know that we can merge the stores. Calculate the cost.
|
||||
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
|
||||
alignment, 0, VL0);
|
||||
}
|
||||
int ScalarStCost = VecTy->getNumElements() *
|
||||
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
|
||||
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
|
||||
VecTy, alignment, 0, VL0);
|
||||
return VecStCost - ScalarStCost;
|
||||
return ReuseShuffleCost + VecStCost - ScalarStCost;
|
||||
}
|
||||
case Instruction::Call: {
|
||||
CallInst *CI = cast<CallInst>(VL0);
|
||||
|
@ -2190,6 +2270,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
|
||||
if (NeedToShuffleReuses) {
|
||||
ReuseShuffleCost -=
|
||||
(ReuseShuffleNumbers - VL.size()) *
|
||||
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
|
||||
}
|
||||
int ScalarCallCost = VecTy->getNumElements() *
|
||||
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
|
||||
|
||||
|
@ -2201,7 +2286,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
|
||||
<< " for " << *CI << "\n");
|
||||
|
||||
return VecCallCost - ScalarCallCost;
|
||||
return ReuseShuffleCost + VecCallCost - ScalarCallCost;
|
||||
}
|
||||
case Instruction::ShuffleVector: {
|
||||
TargetTransformInfo::OperandValueKind Op1VK =
|
||||
|
@ -2209,6 +2294,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
TargetTransformInfo::OperandValueKind Op2VK =
|
||||
TargetTransformInfo::OK_AnyValue;
|
||||
int ScalarCost = 0;
|
||||
if (NeedToShuffleReuses) {
|
||||
for (unsigned Idx : E->ReuseShuffleIndices) {
|
||||
Instruction *I = cast<Instruction>(VL[Idx]);
|
||||
if (!I)
|
||||
continue;
|
||||
ReuseShuffleCost -= TTI->getArithmeticInstrCost(
|
||||
I->getOpcode(), ScalarTy, Op1VK, Op2VK);
|
||||
}
|
||||
for (Value *V : VL) {
|
||||
Instruction *I = cast<Instruction>(V);
|
||||
if (!I)
|
||||
continue;
|
||||
ReuseShuffleCost += TTI->getArithmeticInstrCost(
|
||||
I->getOpcode(), ScalarTy, Op1VK, Op2VK);
|
||||
}
|
||||
}
|
||||
int VecCost = 0;
|
||||
for (Value *i : VL) {
|
||||
Instruction *I = cast<Instruction>(i);
|
||||
|
@ -2227,7 +2328,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
|
||||
VecCost +=
|
||||
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
|
||||
return VecCost - ScalarCost;
|
||||
return ReuseShuffleCost + VecCost - ScalarCost;
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("Unknown instruction");
|
||||
|
@ -2252,7 +2353,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
|
|||
return true;
|
||||
|
||||
// Gathering cost would be too much for tiny trees.
|
||||
if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
|
||||
if ((VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) &&
|
||||
VectorizableTree[0].Scalars.size() < 4)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
@ -2403,10 +2505,14 @@ int BoUpSLP::getTreeCost() {
|
|||
return Cost;
|
||||
}
|
||||
|
||||
int BoUpSLP::getGatherCost(Type *Ty) {
|
||||
int BoUpSLP::getGatherCost(Type *Ty,
|
||||
const DenseSet<unsigned> &ShuffledIndices) {
|
||||
int Cost = 0;
|
||||
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
|
||||
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
|
||||
if (!ShuffledIndices.count(i))
|
||||
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
|
||||
if (!ShuffledIndices.empty())
|
||||
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
|
||||
return Cost;
|
||||
}
|
||||
|
||||
|
@ -2417,7 +2523,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
|
|||
ScalarTy = SI->getValueOperand()->getType();
|
||||
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
||||
// Find the cost of inserting/extracting values from the vector.
|
||||
return getGatherCost(VecTy);
|
||||
// Check if the same elements are inserted several times and count them as
|
||||
// shuffle candidates.
|
||||
DenseSet<unsigned> ShuffledElements;
|
||||
DenseSet<Value *> UniqueElements;
|
||||
// Iterate in reverse order to consider insert elements with the high cost.
|
||||
for (unsigned I = VL.size(); I > 0; --I) {
|
||||
unsigned Idx = I - 1;
|
||||
if (!UniqueElements.insert(VL[Idx]).second)
|
||||
ShuffledElements.insert(Idx);
|
||||
}
|
||||
return getGatherCost(VecTy, ShuffledElements);
|
||||
}
|
||||
|
||||
// Reorder commutative operations in alternate shuffle if the resulting vectors
|
||||
|
@ -2731,14 +2847,6 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
|
|||
return Vec;
|
||||
}
|
||||
|
||||
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
|
||||
if (const TreeEntry *En = getTreeEntry(OpValue)) {
|
||||
if (En->isSame(VL) && En->VectorizedValue)
|
||||
return En->VectorizedValue;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
|
||||
InstructionsState S = getSameOpcode(VL);
|
||||
if (S.Opcode) {
|
||||
|
@ -2751,9 +2859,38 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
|
|||
Type *ScalarTy = S.OpValue->getType();
|
||||
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
|
||||
ScalarTy = SI->getValueOperand()->getType();
|
||||
|
||||
// Check that every instruction appears once in this bundle.
|
||||
SmallVector<unsigned, 4> ReuseShuffleIndicies;
|
||||
SmallVector<Value *, 4> UniqueValues;
|
||||
if (VL.size() > 2) {
|
||||
DenseMap<Value *, unsigned> UniquePositions;
|
||||
for (Value *V : VL) {
|
||||
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
|
||||
ReuseShuffleIndicies.emplace_back(Res.first->second);
|
||||
if (Res.second || isa<Constant>(V))
|
||||
UniqueValues.emplace_back(V);
|
||||
}
|
||||
// Do not shuffle single element or if number of unique values is not power
|
||||
// of 2.
|
||||
if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
|
||||
!llvm::isPowerOf2_32(UniqueValues.size()))
|
||||
ReuseShuffleIndicies.clear();
|
||||
else
|
||||
VL = UniqueValues;
|
||||
}
|
||||
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
||||
|
||||
return Gather(VL, VecTy);
|
||||
Value *V = Gather(VL, VecTy);
|
||||
if (!ReuseShuffleIndicies.empty()) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
ReuseShuffleIndicies, "shuffle");
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
GatherSeq.insert(I);
|
||||
CSEBlocks.insert(I->getParent());
|
||||
}
|
||||
}
|
||||
return V;
|
||||
}
|
||||
|
||||
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
||||
|
@ -2771,9 +2908,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
ScalarTy = SI->getValueOperand()->getType();
|
||||
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
|
||||
|
||||
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
|
||||
|
||||
if (E->NeedToGather) {
|
||||
setInsertPointAfterBundle(E->Scalars, VL0);
|
||||
auto *V = Gather(E->Scalars, VecTy);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
GatherSeq.insert(I);
|
||||
CSEBlocks.insert(I->getParent());
|
||||
}
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
return V;
|
||||
}
|
||||
|
@ -2786,7 +2933,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
|
||||
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
|
||||
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
|
||||
E->VectorizedValue = NewPhi;
|
||||
Value *V = NewPhi;
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
|
||||
// PHINodes may have multiple entries from the same block. We want to
|
||||
// visit every block once.
|
||||
|
@ -2813,17 +2965,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
|
||||
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
|
||||
"Invalid number of incoming values");
|
||||
return NewPhi;
|
||||
return V;
|
||||
}
|
||||
|
||||
case Instruction::ExtractElement: {
|
||||
if (canReuseExtract(E->Scalars, VL0)) {
|
||||
Value *V = VL0->getOperand(0);
|
||||
if (NeedToShuffleReuses) {
|
||||
Builder.SetInsertPoint(VL0);
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
return V;
|
||||
}
|
||||
setInsertPointAfterBundle(E->Scalars, VL0);
|
||||
auto *V = Gather(E->Scalars, VecTy);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
GatherSeq.insert(I);
|
||||
CSEBlocks.insert(I->getParent());
|
||||
}
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
return V;
|
||||
}
|
||||
|
@ -2834,11 +2999,24 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
|
||||
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
|
||||
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
|
||||
E->VectorizedValue = V;
|
||||
return propagateMetadata(V, E->Scalars);
|
||||
Value *NewV = propagateMetadata(V, E->Scalars);
|
||||
if (NeedToShuffleReuses) {
|
||||
NewV = Builder.CreateShuffleVector(
|
||||
NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = NewV;
|
||||
return NewV;
|
||||
}
|
||||
setInsertPointAfterBundle(E->Scalars, VL0);
|
||||
auto *V = Gather(E->Scalars, VecTy);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
GatherSeq.insert(I);
|
||||
CSEBlocks.insert(I->getParent());
|
||||
}
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
return V;
|
||||
}
|
||||
|
@ -2862,11 +3040,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
|
||||
Value *InVec = vectorizeTree(INVL);
|
||||
|
||||
if (Value *V = alreadyVectorized(E->Scalars, VL0))
|
||||
return V;
|
||||
if (E->VectorizedValue) {
|
||||
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
|
||||
return E->VectorizedValue;
|
||||
}
|
||||
|
||||
CastInst *CI = dyn_cast<CastInst>(VL0);
|
||||
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
return V;
|
||||
|
@ -2884,8 +3068,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Value *L = vectorizeTree(LHSV);
|
||||
Value *R = vectorizeTree(RHSV);
|
||||
|
||||
if (Value *V = alreadyVectorized(E->Scalars, VL0))
|
||||
return V;
|
||||
if (E->VectorizedValue) {
|
||||
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
|
||||
return E->VectorizedValue;
|
||||
}
|
||||
|
||||
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
|
||||
Value *V;
|
||||
|
@ -2894,8 +3080,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
else
|
||||
V = Builder.CreateICmp(P0, L, R);
|
||||
|
||||
propagateIRFlags(V, E->Scalars, VL0);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
|
||||
++NumVectorInstructions;
|
||||
return V;
|
||||
}
|
||||
|
@ -2913,10 +3103,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Value *True = vectorizeTree(TrueVec);
|
||||
Value *False = vectorizeTree(FalseVec);
|
||||
|
||||
if (Value *V = alreadyVectorized(E->Scalars, VL0))
|
||||
return V;
|
||||
if (E->VectorizedValue) {
|
||||
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
|
||||
return E->VectorizedValue;
|
||||
}
|
||||
|
||||
Value *V = Builder.CreateSelect(Cond, True, False);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
return V;
|
||||
|
@ -2955,17 +3151,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Value *LHS = vectorizeTree(LHSVL);
|
||||
Value *RHS = vectorizeTree(RHSVL);
|
||||
|
||||
if (Value *V = alreadyVectorized(E->Scalars, VL0))
|
||||
return V;
|
||||
if (E->VectorizedValue) {
|
||||
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
|
||||
return E->VectorizedValue;
|
||||
}
|
||||
|
||||
Value *V = Builder.CreateBinOp(
|
||||
static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
|
||||
E->VectorizedValue = V;
|
||||
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
|
||||
++NumVectorInstructions;
|
||||
propagateIRFlags(V, E->Scalars, VL0);
|
||||
if (auto *I = dyn_cast<Instruction>(V))
|
||||
V = propagateMetadata(I, E->Scalars);
|
||||
|
||||
if (Instruction *I = dyn_cast<Instruction>(V))
|
||||
return propagateMetadata(I, E->Scalars);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
|
||||
return V;
|
||||
}
|
||||
|
@ -2994,9 +3196,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
|
||||
}
|
||||
LI->setAlignment(Alignment);
|
||||
E->VectorizedValue = LI;
|
||||
Value *V = propagateMetadata(LI, E->Scalars);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
return propagateMetadata(LI, E->Scalars);
|
||||
return V;
|
||||
}
|
||||
case Instruction::Store: {
|
||||
StoreInst *SI = cast<StoreInst>(VL0);
|
||||
|
@ -3024,9 +3231,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
|
||||
|
||||
S->setAlignment(Alignment);
|
||||
E->VectorizedValue = S;
|
||||
Value *V = propagateMetadata(S, E->Scalars);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
return propagateMetadata(S, E->Scalars);
|
||||
return V;
|
||||
}
|
||||
case Instruction::GetElementPtr: {
|
||||
setInsertPointAfterBundle(E->Scalars, VL0);
|
||||
|
@ -3050,12 +3262,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
|
||||
Value *V = Builder.CreateGEP(
|
||||
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
|
||||
if (Instruction *I = dyn_cast<Instruction>(V))
|
||||
V = propagateMetadata(I, E->Scalars);
|
||||
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
|
||||
if (Instruction *I = dyn_cast<Instruction>(V))
|
||||
return propagateMetadata(I, E->Scalars);
|
||||
|
||||
return V;
|
||||
}
|
||||
case Instruction::Call: {
|
||||
|
@ -3102,8 +3318,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
if (ScalarArg && getTreeEntry(ScalarArg))
|
||||
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
|
||||
|
||||
propagateIRFlags(V, E->Scalars, VL0);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
|
||||
++NumVectorInstructions;
|
||||
return V;
|
||||
}
|
||||
|
@ -3117,8 +3337,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
Value *LHS = vectorizeTree(LHSVL);
|
||||
Value *RHS = vectorizeTree(RHSVL);
|
||||
|
||||
if (Value *V = alreadyVectorized(E->Scalars, VL0))
|
||||
return V;
|
||||
if (E->VectorizedValue) {
|
||||
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
|
||||
return E->VectorizedValue;
|
||||
}
|
||||
|
||||
// Create a vector of LHS op1 RHS
|
||||
Value *V0 = Builder.CreateBinOp(
|
||||
|
@ -3150,10 +3372,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
propagateIRFlags(V1, OddScalars);
|
||||
|
||||
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
|
||||
if (Instruction *I = dyn_cast<Instruction>(V))
|
||||
V = propagateMetadata(I, E->Scalars);
|
||||
if (NeedToShuffleReuses) {
|
||||
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
|
||||
E->ReuseShuffleIndices, "shuffle");
|
||||
}
|
||||
E->VectorizedValue = V;
|
||||
++NumVectorInstructions;
|
||||
if (Instruction *I = dyn_cast<Instruction>(V))
|
||||
return propagateMetadata(I, E->Scalars);
|
||||
|
||||
return V;
|
||||
}
|
||||
|
@ -3323,14 +3549,12 @@ void BoUpSLP::optimizeGatherSequence() {
|
|||
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
|
||||
<< " gather sequences instructions.\n");
|
||||
// LICM InsertElementInst sequences.
|
||||
for (Instruction *it : GatherSeq) {
|
||||
InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
|
||||
|
||||
if (!Insert)
|
||||
for (Instruction *I : GatherSeq) {
|
||||
if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
|
||||
continue;
|
||||
|
||||
// Check if this block is inside a loop.
|
||||
Loop *L = LI->getLoopFor(Insert->getParent());
|
||||
Loop *L = LI->getLoopFor(I->getParent());
|
||||
if (!L)
|
||||
continue;
|
||||
|
||||
|
@ -3342,15 +3566,15 @@ void BoUpSLP::optimizeGatherSequence() {
|
|||
// If the vector or the element that we insert into it are
|
||||
// instructions that are defined in this basic block then we can't
|
||||
// hoist this instruction.
|
||||
Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
|
||||
Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
|
||||
if (CurrVec && L->contains(CurrVec))
|
||||
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
|
||||
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
|
||||
if (Op0 && L->contains(Op0))
|
||||
continue;
|
||||
if (NewElem && L->contains(NewElem))
|
||||
if (Op1 && L->contains(Op1))
|
||||
continue;
|
||||
|
||||
// We can hoist this instruction. Move it to the pre-header.
|
||||
Insert->moveBefore(PreHeader->getTerminator());
|
||||
I->moveBefore(PreHeader->getTerminator());
|
||||
}
|
||||
|
||||
// Make a list of all reachable blocks in our CSE queue.
|
||||
|
|
|
@ -4,15 +4,14 @@
|
|||
define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
|
||||
; CHECK-LABEL: @i64_simplified(
|
||||
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
|
||||
; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8
|
||||
; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
|
||||
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
|
||||
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
|
||||
; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8
|
||||
; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX3]], align 8
|
||||
; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
|
||||
; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
|
||||
; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
|
||||
|
|
|
@ -137,17 +137,19 @@ define i8 @k(<4 x i8> %x) {
|
|||
|
||||
define i8 @k_bb(<4 x i8> %x) {
|
||||
; CHECK-LABEL: @k_bb(
|
||||
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: br label [[BB1:%.*]]
|
||||
; CHECK: bb1:
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i8 [[TMP8]]
|
||||
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
|
||||
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
|
||||
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
|
||||
; CHECK-NEXT: ret i8 [[TMP6]]
|
||||
;
|
||||
%x0 = extractelement <4 x i8> %x, i32 0
|
||||
br label %bb1
|
||||
|
|
|
@ -16,19 +16,18 @@ target triple = "i386-apple-macosx10.9.0"
|
|||
define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
|
||||
; CHECK-LABEL: @foo(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[K]], i32 3
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[N:%.*]], i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
|
||||
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; CHECK: for.body:
|
||||
; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
|
||||
; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
|
||||
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
|
||||
; CHECK-NEXT: [[ADD10]] = add nsw i32 [[I_024]], 4
|
||||
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000
|
||||
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
|
||||
|
|
Loading…
Reference in New Issue