From 0affccc8d7102ccea17b26479461455e48d7dcae Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 24 Jan 2018 18:36:51 +0000 Subject: [PATCH] Revert "[SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle." This reverts commit r323348 because of the broken buildbots. llvm-svn: 323359 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 468 +++++------------- .../Transforms/SLPVectorizer/X86/PR32086.ll | 11 +- .../SLPVectorizer/X86/blending-shuffle.ll | 20 +- .../Transforms/SLPVectorizer/X86/hoist.ll | 17 +- 4 files changed, 146 insertions(+), 370 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bd523e32f08a..f748ba4b31b4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -662,9 +662,13 @@ private: /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef VL); + /// \returns the pointer to the vectorized value if \p VL is already + /// vectorized, or NULL. They may happen in cycles. + Value *alreadyVectorized(ArrayRef VL, Value *OpValue) const; + /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices); + int getGatherCost(Type *Ty); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -698,12 +702,8 @@ private: /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - assert(VL.size() == ReuseShuffleIndices.size() && "Invalid size"); - return std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + assert(VL.size() == Scalars.size() && "Invalid size"); + return std::equal(VL.begin(), VL.end(), Scalars.begin()); } /// A vector of scalars. @@ -715,9 +715,6 @@ private: /// Do we need to gather this sequence ? bool NeedToGather = false; - /// Does this sequence require some shuffling? - SmallVector ReuseShuffleIndices; - /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -732,15 +729,13 @@ private: }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, - ArrayRef ReuseShuffleIndices = None) { + TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, + int &UserTreeIdx) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; - Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), - ReuseShuffleIndices.end()); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -753,6 +748,7 @@ private: if (UserTreeIdx >= 0) Last->UserTreeIndices.push_back(UserTreeIdx); UserTreeIdx = idx; + return Last; } /// -- Vectorization State -- @@ -766,6 +762,13 @@ private: return nullptr; } + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; @@ -1481,26 +1484,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); - } - if (UniqueValues.size() == VL.size()) { - ReuseShuffleIndicies.clear(); - } else { - DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { - DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } - VL = UniqueValues; - } + for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i + 1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -1508,12 +1498,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, VL0)) { + if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1532,12 +1522,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1555,7 +1545,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0); if (Reuse) { - DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); ++NumOpsWantToKeepOrder[S.Opcode]; } else { SmallVector ReverseVL(VL.rbegin(), VL.rend()); @@ -1563,7 +1553,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, --NumOpsWantToKeepOrder[S.Opcode]; BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, Reuse, UserTreeIdx); return; } case Instruction::Load: { @@ -1578,7 +1568,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1589,7 +1579,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1611,7 +1601,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Consecutive) { ++NumOpsWantToKeepOrder[S.Opcode]; - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1626,7 +1616,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); if (ReverseConsecutive) { --NumOpsWantToKeepOrder[S.Opcode]; @@ -1653,12 +1643,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1681,13 +1671,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1719,7 +1709,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1748,7 +1738,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1761,7 +1751,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1773,12 +1763,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1795,12 +1785,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1818,7 +1808,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1832,7 +1822,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1843,7 +1833,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1856,14 +1846,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1880,11 +1870,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1908,7 +1898,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2001,22 +1991,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { VecTy = VectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - int ReuseShuffleCost = 0; - if (NeedToShuffleReuses) { - ReuseShuffleCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } if (E->NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { - return ReuseShuffleCost + - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).Opcode == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { + if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); @@ -2033,10 +2014,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { IO->getZExtValue()); } } - return ReuseShuffleCost + Cost; + return Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + return getGatherCost(E->Scalars); } InstructionsState S = getSameOpcode(VL); assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); @@ -2049,36 +2030,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ExtractValue: case Instruction::ExtractElement: - if (NeedToShuffleReuses) { - unsigned Idx = 0; - for (unsigned I : E->ReuseShuffleIndices) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(VL[I])->getIndexOperand()); - Idx = IO->getZExtValue(); - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); - } else { - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); - ++Idx; - } - } - Idx = ReuseShuffleNumbers; - for (Value *V : VL) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Idx = IO->getZExtValue(); - } else { - --Idx; - } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); - } - } if (canReuseExtract(VL, S.OpValue)) { - int DeadCost = ReuseShuffleCost; + int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); // If all users are going to be vectorized, instruction can be @@ -2086,12 +2039,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // The same, if have only one user, it will be vectorized for sure. if (areAllUsersVectorized(E)) // Take credit for instruction that will become dead. - DeadCost -= + DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } - return DeadCost; + return -DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return getGatherCost(VecTy); case Instruction::ZExt: case Instruction::SExt: @@ -2106,11 +2059,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0); - } // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), @@ -2119,26 +2067,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = 0; // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); - } + if (!MinBWs.count(VL0) || VecTy != SrcVecTy) + VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, - Builder.getInt1Ty(), VL0); - } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); - return ReuseShuffleCost + VecCost - ScalarCost; + return VecCost - ScalarCost; } case Instruction::Add: case Instruction::FAdd: @@ -2196,19 +2137,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Op2VP = TargetTransformInfo::OP_PowerOf2; SmallVector Operands(VL0->operand_values()); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, - Op2VP, Operands); - } int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); - return ReuseShuffleCost + VecCost - ScalarCost; + return VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = @@ -2216,46 +2151,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getArithmeticInstrCost(Instruction::Add, - ScalarTy, Op1VK, Op2VK); - } int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); - return ReuseShuffleCost + VecCost - ScalarCost; + return VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast(VL0)->getAlignment(); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, - alignment, 0, VL0); - } int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); - return ReuseShuffleCost + VecLdCost - ScalarLdCost; + return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. unsigned alignment = dyn_cast(VL0)->getAlignment(); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, - alignment, 0, VL0); - } int ScalarStCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); - return ReuseShuffleCost + VecStCost - ScalarStCost; + return VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); @@ -2270,11 +2190,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { if (auto *FPMO = dyn_cast(CI)) FMF = FPMO->getFastMathFlags(); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= - (ReuseShuffleNumbers - VL.size()) * - TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); - } int ScalarCallCost = VecTy->getNumElements() * TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); @@ -2286,7 +2201,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); - return ReuseShuffleCost + VecCallCost - ScalarCallCost; + return VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { TargetTransformInfo::OperandValueKind Op1VK = @@ -2294,22 +2209,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; int ScalarCost = 0; - if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); - if (!I) - continue; - ReuseShuffleCost -= TTI->getArithmeticInstrCost( - I->getOpcode(), ScalarTy, Op1VK, Op2VK); - } - for (Value *V : VL) { - Instruction *I = cast(V); - if (!I) - continue; - ReuseShuffleCost += TTI->getArithmeticInstrCost( - I->getOpcode(), ScalarTy, Op1VK, Op2VK); - } - } int VecCost = 0; for (Value *i : VL) { Instruction *I = cast(i); @@ -2328,7 +2227,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); - return ReuseShuffleCost + VecCost - ScalarCost; + return VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); @@ -2353,8 +2252,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { return true; // Gathering cost would be too much for tiny trees. - if ((VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) && - VectorizableTree[0].Scalars.size() < 4) + if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) return false; return true; @@ -2505,14 +2403,10 @@ int BoUpSLP::getTreeCost() { return Cost; } -int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet &ShuffledIndices) { +int BoUpSLP::getGatherCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - if (!ShuffledIndices.count(i)) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } @@ -2523,17 +2417,7 @@ int BoUpSLP::getGatherCost(ArrayRef VL) { ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. - // Check if the same elements are inserted several times and count them as - // shuffle candidates. - DenseSet ShuffledElements; - DenseSet UniqueElements; - // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { - unsigned Idx = I - 1; - if (!UniqueElements.insert(VL[Idx]).second) - ShuffledElements.insert(Idx); - } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy); } // Reorder commutative operations in alternate shuffle if the resulting vectors @@ -2847,6 +2731,14 @@ Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { return Vec; } +Value *BoUpSLP::alreadyVectorized(ArrayRef VL, Value *OpValue) const { + if (const TreeEntry *En = getTreeEntry(OpValue)) { + if (En->isSame(VL) && En->VectorizedValue) + return En->VectorizedValue; + } + return nullptr; +} + Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.Opcode) { @@ -2859,38 +2751,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { Type *ScalarTy = S.OpValue->getType(); if (StoreInst *SI = dyn_cast(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); - - // Check that every instruction appears once in this bundle. - SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { - DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second || isa(V)) - UniqueValues.emplace_back(V); - } - // Do not shuffle single element or if number of unique values is not power - // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || - !llvm::isPowerOf2_32(UniqueValues.size())) - ReuseShuffleIndicies.clear(); - else - VL = UniqueValues; - } VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - Value *V = Gather(VL, VecTy); - if (!ReuseShuffleIndicies.empty()) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - ReuseShuffleIndicies, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } - return V; + return Gather(VL, VecTy); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -2908,19 +2771,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } E->VectorizedValue = V; return V; } @@ -2933,12 +2786,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - Value *V = NewPhi; - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; + E->VectorizedValue = NewPhi; // PHINodes may have multiple entries from the same block. We want to // visit every block once. @@ -2965,30 +2813,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return V; + return NewPhi; } case Instruction::ExtractElement: { if (canReuseExtract(E->Scalars, VL0)) { Value *V = VL0->getOperand(0); - if (NeedToShuffleReuses) { - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; return V; } setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } E->VectorizedValue = V; return V; } @@ -2999,24 +2834,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (NeedToShuffleReuses) { - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; + E->VectorizedValue = V; + return propagateMetadata(V, E->Scalars); } setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } E->VectorizedValue = V; return V; } @@ -3040,17 +2862,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *InVec = vectorizeTree(INVL); - if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + if (Value *V = alreadyVectorized(E->Scalars, VL0)) + return V; CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -3068,10 +2884,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); - if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + if (Value *V = alreadyVectorized(E->Scalars, VL0)) + return V; CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; @@ -3080,12 +2894,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { else V = Builder.CreateICmp(P0, L, R); - propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -3103,16 +2913,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); - if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + if (Value *V = alreadyVectorized(E->Scalars, VL0)) + return V; Value *V = Builder.CreateSelect(Cond, True, False); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -3151,24 +2955,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + if (Value *V = alreadyVectorized(E->Scalars, VL0)) + return V; Value *V = Builder.CreateBinOp( static_cast(S.Opcode), LHS, RHS); - propagateIRFlags(V, E->Scalars, VL0); - if (auto *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); - - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); + return V; } case Instruction::Load: { @@ -3196,14 +2994,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } LI->setAlignment(Alignment); - Value *V = propagateMetadata(LI, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; + E->VectorizedValue = LI; ++NumVectorInstructions; - return V; + return propagateMetadata(LI, E->Scalars); } case Instruction::Store: { StoreInst *SI = cast(VL0); @@ -3231,14 +3024,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); S->setAlignment(Alignment); - Value *V = propagateMetadata(S, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; + E->VectorizedValue = S; ++NumVectorInstructions; - return V; + return propagateMetadata(S, E->Scalars); } case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, VL0); @@ -3262,16 +3050,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); - if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); - - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; ++NumVectorInstructions; + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); + return V; } case Instruction::Call: { @@ -3318,12 +3102,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); - propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -3337,10 +3117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + if (Value *V = alreadyVectorized(E->Scalars, VL0)) + return V; // Create a vector of LHS op1 RHS Value *V0 = Builder.CreateBinOp( @@ -3372,14 +3150,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, OddScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); - if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } E->VectorizedValue = V; ++NumVectorInstructions; + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); return V; } @@ -3549,12 +3323,14 @@ void BoUpSLP::optimizeGatherSequence() { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { - if (!isa(I) && !isa(I)) + for (Instruction *it : GatherSeq) { + InsertElementInst *Insert = dyn_cast(it); + + if (!Insert) continue; // Check if this block is inside a loop. - Loop *L = LI->getLoopFor(I->getParent()); + Loop *L = LI->getLoopFor(Insert->getParent()); if (!L) continue; @@ -3566,15 +3342,15 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast(I->getOperand(0)); - auto *Op1 = dyn_cast(I->getOperand(1)); - if (Op0 && L->contains(Op0)) + Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) continue; - if (Op1 && L->contains(Op1)) + if (NewElem && L->contains(NewElem)) continue; // We can hoist this instruction. Move it to the pre-header. - I->moveBefore(PreHeader->getTerminator()); + Insert->moveBefore(PreHeader->getTerminator()); } // Make a list of all reachable blocks in our CSE queue. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll index 7613fadb5ab9..ee9ee7c34dea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -4,14 +4,15 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplified( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8 +; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll index 00ac87d6a997..22dfffa722ad 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -137,19 +137,17 @@ define i8 @k(<4 x i8> %x) { define i8 @k_bb(<4 x i8> %x) { ; CHECK-LABEL: @k_bb( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]] -; CHECK-NEXT: ret i8 [[TMP6]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll index 885d11acfa15..7d0379eab902 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll @@ -16,18 +16,19 @@ target triple = "i386-apple-macosx10.9.0" define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[K]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: [[ADD10]] = add nsw i32 [[I_024]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]