From 6ca48efcf6e16adfcf33688d86de7bd2bb75a49a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 26 Jul 2021 05:43:53 -0700 Subject: [PATCH] [SLP]Fix costs calculations. Need to fix several cost-related problems. The final type may be defined incorrectly because of to early definition (we may end up with the wider type), the CommonCost should not be redefined in ExtractElements cost related calculations and the shuffle of the final insertelements vectors should be calculated as a cost of single vector permutations + costs of two vector permutations for other n-1 incoming vectors. Differential Revision: https://reviews.llvm.org/D106578 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 24 ++++++++++------- .../X86/vec_list_bias-inseltpoison.ll | 26 +++++++++---------- .../SLPVectorizer/X86/vec_list_bias.ll | 26 +++++++++---------- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a10302a98751..cc3f5c7d4b48 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3654,7 +3654,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, else if (auto *IE = dyn_cast(VL[0])) ScalarTy = IE->getOperand(1)->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - auto *FinalVecTy = VecTy; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so @@ -3662,6 +3661,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (MinBWs.count(VL[0])) VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto *FinalVecTy = VecTy; unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3838,7 +3838,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, case Instruction::ExtractElement: { // The common cost of removal ExtractElement/ExtractValue instructions + // the cost of shuffles, if required to resuffle the original vector. - InstructionCost CommonCost = 0; if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { @@ -4133,7 +4132,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, commonAlignment(CommonAlignment, cast(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, VL0); + /*VariableMask=*/false, CommonAlignment, CostKind, VL0); } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); return CommonCost + VecLdCost - ScalarLdCost; @@ -4471,7 +4470,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallPtrSet ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallBitVector IsIdentity; SmallVector VF; SmallVector> ShuffleMask; SmallVector FirstUsers; @@ -4528,15 +4526,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { ShuffleMask.emplace_back(VF.back(), UndefMaskElem); FirstUsers.push_back(EU.User); DemandedElts.push_back(APInt::getNullValue(VF.back())); - IsIdentity.push_back(true); VecId = FirstUsers.size() - 1; } else { VecId = std::distance(FirstUsers.begin(), It); } int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; - IsIdentity.set(IsIdentity.test(VecId) & - (EU.Lane == Idx || EU.Lane == UndefMaskElem)); DemandedElts[VecId].setBit(Idx); } } @@ -4562,7 +4557,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - if (!IsIdentity.test(I)) { + // For the very first element - simple shuffle of the source vector. + int Limit = ShuffleMask[I].size() * 2; + if (I == 0 && + all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, cast(FirstUsers[I]->getType()), ShuffleMask[I]); @@ -4571,10 +4570,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + continue; } + // Other elements - permutation of 2 vectors (the initial one and the next + // Ith incoming vector). unsigned VF = ShuffleMask[I].size(); - for (int &Mask : ShuffleMask[I]) - Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask; + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int &Mask = ShuffleMask[I][Idx]; + Mask = Mask == UndefMaskElem ? Idx : VF + Mask; + } InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteTwoSrc, cast(FirstUsers[I]->getType()), ShuffleMask[I]); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 70c42b87c229..e586f0b649a9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -40,22 +40,22 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index 34f38e86bf40..e3e4b0e823a5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -40,22 +40,22 @@ define void @test(i32* nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4