Revert "[SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle."

This reverts commit r323430 to fix buildbots.

llvm-svn: 323432
This commit is contained in:
Alexey Bataev 2018-01-25 15:20:29 +00:00
parent ffbc2380f6
commit a0b2c78efc
4 changed files with 154 additions and 376 deletions

View File

@ -662,9 +662,13 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL. /// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL); Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
/// \returns the scalarization cost for this type. Scalarization in this /// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars. /// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices); int getGatherCost(Type *Ty);
/// \returns the scalarization cost for this list of values. Assuming that /// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the /// this subtree gets vectorized, we may need to extract the values from the
@ -698,12 +702,8 @@ private:
/// \returns true if the scalars in VL are equal to this entry. /// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const { bool isSame(ArrayRef<Value *> VL) const {
if (VL.size() == Scalars.size()) assert(VL.size() == Scalars.size() && "Invalid size");
return std::equal(VL.begin(), VL.end(), Scalars.begin()); return std::equal(VL.begin(), VL.end(), Scalars.begin());
assert(VL.size() == ReuseShuffleIndices.size() && "Invalid size");
return std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
[this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
} }
/// A vector of scalars. /// A vector of scalars.
@ -715,9 +715,6 @@ private:
/// Do we need to gather this sequence ? /// Do we need to gather this sequence ?
bool NeedToGather = false; bool NeedToGather = false;
/// Does this sequence require some shuffling?
SmallVector<unsigned, 4> ReuseShuffleIndices;
/// Points back to the VectorizableTree. /// Points back to the VectorizableTree.
/// ///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@ -732,15 +729,13 @@ private:
}; };
/// Create a new VectorizableTree entry. /// Create a new VectorizableTree entry.
void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
ArrayRef<unsigned> ReuseShuffleIndices = None) { int &UserTreeIdx) {
VectorizableTree.emplace_back(VectorizableTree); VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1; int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx]; TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized; Last->NeedToGather = !Vectorized;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
if (Vectorized) { if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) { for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@ -753,6 +748,7 @@ private:
if (UserTreeIdx >= 0) if (UserTreeIdx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx); Last->UserTreeIndices.push_back(UserTreeIdx);
UserTreeIdx = idx; UserTreeIdx = idx;
return Last;
} }
/// -- Vectorization State -- /// -- Vectorization State --
@ -766,6 +762,13 @@ private:
return nullptr; return nullptr;
} }
const TreeEntry *getTreeEntry(Value *V) const {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return &VectorizableTree[I->second];
return nullptr;
}
/// Maps a specific scalar to its tree entry. /// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry; SmallDenseMap<Value*, int> ScalarToTreeEntry;
@ -1429,12 +1432,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry. // Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) { if (TreeEntry *E = getTreeEntry(S.OpValue)) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (!E->isSame(VL)) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false, UserTreeIdx); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
}
// Record the reuse of the tree node. FIXME, currently this is only used to // Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization. // properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx); E->UserTreeIndices.push_back(UserTreeIdx);
@ -1479,26 +1484,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} }
// Check that every instruction appears once in this bundle. // Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies; for (unsigned i = 0, e = VL.size(); i < e; ++i)
SmallVector<Value *, 4> UniqueValues; for (unsigned j = i + 1; j < e; ++j)
DenseMap<Value *, unsigned> UniquePositions; if (VL[i] == VL[j]) {
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
if (UniqueValues.size() == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false, UserTreeIdx); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
VL = UniqueValues;
}
auto &BSRef = BlocksSchedules[BB]; auto &BSRef = BlocksSchedules[BB];
if (!BSRef) if (!BSRef)
@ -1506,12 +1498,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef.get(); BlockScheduling &BS = *BSRef.get();
if (!BS.tryScheduleBundle(VL, this, VL0)) { if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) || assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) && !BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure"); "tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@ -1530,12 +1522,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Term) { if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@ -1553,7 +1545,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ExtractElement: { case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0); bool Reuse = canReuseExtract(VL, VL0);
if (Reuse) { if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
++NumOpsWantToKeepOrder[S.Opcode]; ++NumOpsWantToKeepOrder[S.Opcode];
} else { } else {
SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend()); SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
@ -1561,7 +1553,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
--NumOpsWantToKeepOrder[S.Opcode]; --NumOpsWantToKeepOrder[S.Opcode];
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
} }
newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, Reuse, UserTreeIdx);
return; return;
} }
case Instruction::Load: { case Instruction::Load: {
@ -1576,7 +1568,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) != if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) { DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return; return;
} }
@ -1587,7 +1579,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LoadInst *L = cast<LoadInst>(VL[i]); LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) { if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return; return;
} }
@ -1609,7 +1601,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Consecutive) { if (Consecutive) {
++NumOpsWantToKeepOrder[S.Opcode]; ++NumOpsWantToKeepOrder[S.Opcode];
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n"); DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return; return;
} }
@ -1624,7 +1616,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} }
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) { if (ReverseConsecutive) {
--NumOpsWantToKeepOrder[S.Opcode]; --NumOpsWantToKeepOrder[S.Opcode];
@ -1651,12 +1643,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) { if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return; return;
} }
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n"); DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@ -1679,13 +1671,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 || if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) { Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return; return;
} }
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n"); DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@ -1717,7 +1709,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And: case Instruction::And:
case Instruction::Or: case Instruction::Or:
case Instruction::Xor: case Instruction::Xor:
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to // Sort operands of the instructions so that each side is more likely to
@ -1746,7 +1738,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (cast<Instruction>(VL[j])->getNumOperands() != 2) { if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
} }
@ -1759,7 +1751,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Ty0 != CurTy) { if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
} }
@ -1771,12 +1763,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
DEBUG( DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
return; return;
} }
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) { for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands; ValueList Operands;
@ -1793,12 +1785,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return; return;
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n"); DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands; ValueList Operands;
@ -1816,7 +1808,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) { if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return; return;
} }
@ -1830,7 +1822,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID || getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) { !CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n"); << "\n");
return; return;
@ -1841,7 +1833,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1); Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) { if (A1I != A1J) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J << " argument "<< A1I<<"!=" << A1J
<< "\n"); << "\n");
@ -1854,14 +1846,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n'); << *VL[i] << '\n');
return; return;
} }
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands; ValueList Operands;
// Prepare the operand vector. // Prepare the operand vector.
@ -1878,11 +1870,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// then do not vectorize this instruction. // then do not vectorize this instruction.
if (!S.IsAltShuffle) { if (!S.IsAltShuffle) {
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return; return;
} }
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization. // Reorder operands if reordering would enable vectorization.
@ -1906,7 +1898,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
default: default:
BS.cancelScheduling(VL, VL0); BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return; return;
} }
@ -1999,22 +1991,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy = VectorType::get( VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
int ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
if (E->NeedToGather) { if (E->NeedToGather) {
if (allConstant(VL)) if (allConstant(VL))
return 0; return 0;
if (isSplat(VL)) { if (isSplat(VL)) {
return ReuseShuffleCost + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
} }
if (getSameOpcode(VL).Opcode == Instruction::ExtractElement && if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) { if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
@ -2031,10 +2014,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
IO->getZExtValue()); IO->getZExtValue());
} }
} }
return ReuseShuffleCost + Cost; return Cost;
} }
} }
return ReuseShuffleCost + getGatherCost(VL); return getGatherCost(E->Scalars);
} }
InstructionsState S = getSameOpcode(VL); InstructionsState S = getSameOpcode(VL);
assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
@ -2047,36 +2030,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::ExtractValue: case Instruction::ExtractValue:
case Instruction::ExtractElement: case Instruction::ExtractElement:
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(VL[I])->getIndexOperand());
Idx = IO->getZExtValue();
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
} else {
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
++Idx;
}
}
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(V)->getIndexOperand());
Idx = IO->getZExtValue();
} else {
--Idx;
}
ReuseShuffleCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
if (canReuseExtract(VL, S.OpValue)) { if (canReuseExtract(VL, S.OpValue)) {
int DeadCost = ReuseShuffleCost; int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) { for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]); Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, instruction can be // If all users are going to be vectorized, instruction can be
@ -2084,12 +2039,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// The same, if have only one user, it will be vectorized for sure. // The same, if have only one user, it will be vectorized for sure.
if (areAllUsersVectorized(E)) if (areAllUsersVectorized(E))
// Take credit for instruction that will become dead. // Take credit for instruction that will become dead.
DeadCost -= DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
} }
return DeadCost; return -DeadCost;
} }
return ReuseShuffleCost + getGatherCost(VL); return getGatherCost(VecTy);
case Instruction::ZExt: case Instruction::ZExt:
case Instruction::SExt: case Instruction::SExt:
@ -2104,11 +2059,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc: case Instruction::FPTrunc:
case Instruction::BitCast: { case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType(); Type *SrcTy = VL0->getOperand(0)->getType();
if (NeedToShuffleReuses) {
ReuseShuffleCost -=
(ReuseShuffleNumbers - VL.size()) *
TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0);
}
// Calculate the cost of this instruction. // Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
@ -2117,26 +2067,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
int VecCost = 0; int VecCost = 0;
// Check if the values are candidates to demote. // Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
VecCost = ReuseShuffleCost + VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
}
return VecCost - ScalarCost; return VecCost - ScalarCost;
} }
case Instruction::FCmp: case Instruction::FCmp:
case Instruction::ICmp: case Instruction::ICmp:
case Instruction::Select: { case Instruction::Select: {
// Calculate the cost of this instruction. // Calculate the cost of this instruction.
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
Builder.getInt1Ty(), VL0);
}
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * int ScalarCost = VecTy->getNumElements() *
TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
return ReuseShuffleCost + VecCost - ScalarCost; return VecCost - ScalarCost;
} }
case Instruction::Add: case Instruction::Add:
case Instruction::FAdd: case Instruction::FAdd:
@ -2194,19 +2137,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VP = TargetTransformInfo::OP_PowerOf2;
SmallVector<const Value *, 4> Operands(VL0->operand_values()); SmallVector<const Value *, 4> Operands(VL0->operand_values());
if (NeedToShuffleReuses) {
ReuseShuffleCost -=
(ReuseShuffleNumbers - VL.size()) *
TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
Op2VP, Operands);
}
int ScalarCost = int ScalarCost =
VecTy->getNumElements() * VecTy->getNumElements() *
TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
Op2VP, Operands); Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP, Operands); Op1VP, Op2VP, Operands);
return ReuseShuffleCost + VecCost - ScalarCost; return VecCost - ScalarCost;
} }
case Instruction::GetElementPtr: { case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OperandValueKind Op1VK =
@ -2214,46 +2151,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue; TargetTransformInfo::OK_UniformConstantValue;
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
TTI->getArithmeticInstrCost(Instruction::Add,
ScalarTy, Op1VK, Op2VK);
}
int ScalarCost = int ScalarCost =
VecTy->getNumElements() * VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
int VecCost = int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost; return VecCost - ScalarCost;
} }
case Instruction::Load: { case Instruction::Load: {
// Cost of wide load - cost of scalar loads. // Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
alignment, 0, VL0);
}
int ScalarLdCost = VecTy->getNumElements() * int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
VecTy, alignment, 0, VL0); VecTy, alignment, 0, VL0);
return ReuseShuffleCost + VecLdCost - ScalarLdCost; return VecLdCost - ScalarLdCost;
} }
case Instruction::Store: { case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost. // We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
alignment, 0, VL0);
}
int ScalarStCost = VecTy->getNumElements() * int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store, int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, alignment, 0, VL0); VecTy, alignment, 0, VL0);
return ReuseShuffleCost + VecStCost - ScalarStCost; return VecStCost - ScalarStCost;
} }
case Instruction::Call: { case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0); CallInst *CI = cast<CallInst>(VL0);
@ -2268,11 +2190,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags(); FMF = FPMO->getFastMathFlags();
if (NeedToShuffleReuses) {
ReuseShuffleCost -=
(ReuseShuffleNumbers - VL.size()) *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
}
int ScalarCallCost = VecTy->getNumElements() * int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
@ -2284,7 +2201,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
<< " (" << VecCallCost << "-" << ScalarCallCost << ")" << " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n"); << " for " << *CI << "\n");
return ReuseShuffleCost + VecCallCost - ScalarCallCost; return VecCallCost - ScalarCallCost;
} }
case Instruction::ShuffleVector: { case Instruction::ShuffleVector: {
TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OperandValueKind Op1VK =
@ -2292,22 +2209,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0; int ScalarCost = 0;
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
if (!I)
continue;
ReuseShuffleCost -= TTI->getArithmeticInstrCost(
I->getOpcode(), ScalarTy, Op1VK, Op2VK);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
if (!I)
continue;
ReuseShuffleCost += TTI->getArithmeticInstrCost(
I->getOpcode(), ScalarTy, Op1VK, Op2VK);
}
}
int VecCost = 0; int VecCost = 0;
for (Value *i : VL) { for (Value *i : VL) {
Instruction *I = cast<Instruction>(i); Instruction *I = cast<Instruction>(i);
@ -2326,7 +2227,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
VecCost += VecCost +=
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost; return VecCost - ScalarCost;
} }
default: default:
llvm_unreachable("Unknown instruction"); llvm_unreachable("Unknown instruction");
@ -2351,8 +2252,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
return true; return true;
// Gathering cost would be too much for tiny trees. // Gathering cost would be too much for tiny trees.
if ((VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) && if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
VectorizableTree[0].Scalars.size() < 4)
return false; return false;
return true; return true;
@ -2503,14 +2403,10 @@ int BoUpSLP::getTreeCost() {
return Cost; return Cost;
} }
int BoUpSLP::getGatherCost(Type *Ty, int BoUpSLP::getGatherCost(Type *Ty) {
const DenseSet<unsigned> &ShuffledIndices) {
int Cost = 0; int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost; return Cost;
} }
@ -2521,17 +2417,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType(); ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector. // Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as return getGatherCost(VecTy);
// shuffle candidates.
DenseSet<unsigned> ShuffledElements;
DenseSet<Value *> UniqueElements;
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
if (!UniqueElements.insert(VL[Idx]).second)
ShuffledElements.insert(Idx);
}
return getGatherCost(VecTy, ShuffledElements);
} }
// Reorder commutative operations in alternate shuffle if the resulting vectors // Reorder commutative operations in alternate shuffle if the resulting vectors
@ -2829,7 +2715,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
if (TreeEntry *E = getTreeEntry(VL[i])) { if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract. // Find which lane we need to extract.
int FoundLane = -1; int FoundLane = -1;
for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ? // Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) { if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane; FoundLane = Lane;
@ -2845,6 +2731,14 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec; return Vec;
} }
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
if (const TreeEntry *En = getTreeEntry(OpValue)) {
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL); InstructionsState S = getSameOpcode(VL);
if (S.Opcode) { if (S.Opcode) {
@ -2857,38 +2751,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
Type *ScalarTy = S.OpValue->getType(); Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType(); ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second || isa<Constant>(V))
UniqueValues.emplace_back(V);
}
// Do not shuffle single element or if number of unique values is not power
// of 2.
if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
!llvm::isPowerOf2_32(UniqueValues.size()))
ReuseShuffleIndicies.clear();
else
VL = UniqueValues;
}
VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy); return Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
ReuseShuffleIndicies, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
return V;
} }
Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@ -2906,19 +2771,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType(); ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->NeedToGather) { if (E->NeedToGather) {
setInsertPointAfterBundle(E->Scalars, VL0); setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy); auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
E->VectorizedValue = V; E->VectorizedValue = V;
return V; return V;
} }
@ -2931,12 +2786,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc()); Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi; E->VectorizedValue = NewPhi;
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to // PHINodes may have multiple entries from the same block. We want to
// visit every block once. // visit every block once.
@ -2963,30 +2813,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values"); "Invalid number of incoming values");
return V; return NewPhi;
} }
case Instruction::ExtractElement: { case Instruction::ExtractElement: {
if (canReuseExtract(E->Scalars, VL0)) { if (canReuseExtract(E->Scalars, VL0)) {
Value *V = VL0->getOperand(0); Value *V = VL0->getOperand(0);
if (NeedToShuffleReuses) {
Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
return V; return V;
} }
setInsertPointAfterBundle(E->Scalars, VL0); setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy); auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
E->VectorizedValue = V; E->VectorizedValue = V;
return V; return V;
} }
@ -2997,24 +2834,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
Value *NewV = propagateMetadata(V, E->Scalars); E->VectorizedValue = V;
if (NeedToShuffleReuses) { return propagateMetadata(V, E->Scalars);
NewV = Builder.CreateShuffleVector(
NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = NewV;
return NewV;
} }
setInsertPointAfterBundle(E->Scalars, VL0); setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy); auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
E->VectorizedValue = V; E->VectorizedValue = V;
return V; return V;
} }
@ -3038,17 +2862,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *InVec = vectorizeTree(INVL); Value *InVec = vectorizeTree(INVL);
if (E->VectorizedValue) { if (Value *V = alreadyVectorized(E->Scalars, VL0))
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return V;
return E->VectorizedValue;
}
CastInst *CI = dyn_cast<CastInst>(VL0); CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
return V; return V;
@ -3066,10 +2884,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *L = vectorizeTree(LHSV); Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV); Value *R = vectorizeTree(RHSV);
if (E->VectorizedValue) { if (Value *V = alreadyVectorized(E->Scalars, VL0))
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return V;
return E->VectorizedValue;
}
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V; Value *V;
@ -3078,12 +2894,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
else else
V = Builder.CreateICmp(P0, L, R); V = Builder.CreateICmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions; ++NumVectorInstructions;
return V; return V;
} }
@ -3101,16 +2913,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *True = vectorizeTree(TrueVec); Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec); Value *False = vectorizeTree(FalseVec);
if (E->VectorizedValue) { if (Value *V = alreadyVectorized(E->Scalars, VL0))
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return V;
return E->VectorizedValue;
}
Value *V = Builder.CreateSelect(Cond, True, False); Value *V = Builder.CreateSelect(Cond, True, False);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
return V; return V;
@ -3149,24 +2955,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = vectorizeTree(LHSVL); Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL); Value *RHS = vectorizeTree(RHSVL);
if (E->VectorizedValue) { if (Value *V = alreadyVectorized(E->Scalars, VL0))
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return V;
return E->VectorizedValue;
}
Value *V = Builder.CreateBinOp( Value *V = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions; ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V; return V;
} }
case Instruction::Load: { case Instruction::Load: {
@ -3194,14 +2994,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy); Alignment = DL->getABITypeAlignment(ScalarLoadTy);
} }
LI->setAlignment(Alignment); LI->setAlignment(Alignment);
Value *V = propagateMetadata(LI, E->Scalars); E->VectorizedValue = LI;
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
return V; return propagateMetadata(LI, E->Scalars);
} }
case Instruction::Store: { case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0); StoreInst *SI = cast<StoreInst>(VL0);
@ -3229,14 +3024,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
S->setAlignment(Alignment); S->setAlignment(Alignment);
Value *V = propagateMetadata(S, E->Scalars); E->VectorizedValue = S;
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
return V; return propagateMetadata(S, E->Scalars);
} }
case Instruction::GetElementPtr: { case Instruction::GetElementPtr: {
setInsertPointAfterBundle(E->Scalars, VL0); setInsertPointAfterBundle(E->Scalars, VL0);
@ -3260,16 +3050,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP( Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V; return V;
} }
case Instruction::Call: { case Instruction::Call: {
@ -3316,12 +3102,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (ScalarArg && getTreeEntry(ScalarArg)) if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions; ++NumVectorInstructions;
return V; return V;
} }
@ -3335,10 +3117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = vectorizeTree(LHSVL); Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL); Value *RHS = vectorizeTree(RHSVL);
if (E->VectorizedValue) { if (Value *V = alreadyVectorized(E->Scalars, VL0))
DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return V;
return E->VectorizedValue;
}
// Create a vector of LHS op1 RHS // Create a vector of LHS op1 RHS
Value *V0 = Builder.CreateBinOp( Value *V0 = Builder.CreateBinOp(
@ -3370,14 +3150,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
propagateIRFlags(V1, OddScalars); propagateIRFlags(V1, OddScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V; E->VectorizedValue = V;
++NumVectorInstructions; ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V; return V;
} }
@ -3547,12 +3323,14 @@ void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n"); << " gather sequences instructions.\n");
// LICM InsertElementInst sequences. // LICM InsertElementInst sequences.
for (Instruction *I : GatherSeq) { for (Instruction *it : GatherSeq) {
if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
if (!Insert)
continue; continue;
// Check if this block is inside a loop. // Check if this block is inside a loop.
Loop *L = LI->getLoopFor(I->getParent()); Loop *L = LI->getLoopFor(Insert->getParent());
if (!L) if (!L)
continue; continue;
@ -3564,15 +3342,15 @@ void BoUpSLP::optimizeGatherSequence() {
// If the vector or the element that we insert into it are // If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't // instructions that are defined in this basic block then we can't
// hoist this instruction. // hoist this instruction.
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
if (Op0 && L->contains(Op0)) if (CurrVec && L->contains(CurrVec))
continue; continue;
if (Op1 && L->contains(Op1)) if (NewElem && L->contains(NewElem))
continue; continue;
// We can hoist this instruction. Move it to the pre-header. // We can hoist this instruction. Move it to the pre-header.
I->moveBefore(PreHeader->getTerminator()); Insert->moveBefore(PreHeader->getTerminator());
} }
// Make a list of all reachable blocks in our CSE queue. // Make a list of all reachable blocks in our CSE queue.

View File

@ -4,14 +4,15 @@
define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
; CHECK-LABEL: @i64_simplified( ; CHECK-LABEL: @i64_simplified(
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 ; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8
; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX3]], align 8
; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1

View File

@ -137,19 +137,17 @@ define i8 @k(<4 x i8> %x) {
define i8 @k_bb(<4 x i8> %x) { define i8 @k_bb(<4 x i8> %x) {
; CHECK-LABEL: @k_bb( ; CHECK-LABEL: @k_bb(
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1: ; CHECK: bb1:
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]] ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]] ; CHECK-NEXT: ret i8 [[TMP8]]
; CHECK-NEXT: ret i8 [[TMP6]]
; ;
%x0 = extractelement <4 x i8> %x, i32 0 %x0 = extractelement <4 x i8> %x, i32 0
br label %bb1 br label %bb1

View File

@ -16,18 +16,19 @@ target triple = "i386-apple-macosx10.9.0"
define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) { define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
; CHECK-LABEL: @foo( ; CHECK-LABEL: @foo(
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[K]], i32 3
; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body: ; CHECK: for.body:
; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[ADD10]] = add nsw i32 [[I_024]], 4 ; CHECK-NEXT: [[ADD10]] = add nsw i32 [[I_024]], 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]