From afc9e7517ada7568b60187a1eeb104e8a6b03877 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 12 Nov 2021 09:09:15 -0800 Subject: [PATCH] [SLP]Improve cost model for the shuffled extracts. Improved the calculation of the shuffled extracts, where possible. Need to calculate the cost for the extracted scalars if some users are not insertelements + improved the total estimation of the shuffled scalars used in insertelements build vectors. Differential Revision: https://reviews.llvm.org/D113782 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 118 +++++++++++++----- .../X86/cmp_commute-inseltpoison.ll | 19 +-- .../SLPVectorizer/X86/cmp_commute.ll | 19 +-- .../X86/load-merge-inseltpoison.ll | 10 +- .../SLPVectorizer/X86/load-merge.ll | 10 +- .../X86/vec_list_bias-inseltpoison.ll | 35 +++--- .../SLPVectorizer/X86/vec_list_bias.ll | 35 +++--- 7 files changed, 155 insertions(+), 91 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 44b261a53d13..fb18da0ddc69 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5401,7 +5401,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallVector DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5449,7 +5450,22 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa(Base)) { + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(Base)) { + VU = Base; + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5458,6 +5474,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5481,47 +5498,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast(FirstUsers[I]->getType()), ShuffleMask[I]); + cast(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); - Cost -= InsertCost; + cast(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. + SmallVector CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll index 5cb0bf5b27d9..85a79014506a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -236,22 +236,23 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) { define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] -; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> -; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll index 6a80c41fb5dd..5e99d77106ec 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -236,22 +236,23 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) { define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] -; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> -; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index f9a5434b0400..3d68b9b8fa06 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -54,12 +54,14 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* +; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index a8c83781d3aa..2683ec1e1d72 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,12 +54,14 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* +; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index e586f0b649a9..dca8d192fb20 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -30,7 +30,6 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] @@ -40,22 +39,24 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index e3e4b0e823a5..c2f5f1ba5b7d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -30,7 +30,6 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] @@ -40,22 +39,24 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4