diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index de3843c58d23..44b69c16e2b0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15,24 +15,22 @@
 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -47,7 +45,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -366,9 +364,9 @@ public:

   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
-          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
+          DominatorTree *Dt, AssumptionCache *AC)
       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
-        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
+        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
         Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
   }
@@ -402,7 +400,6 @@ public:
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
     }
-    MinBWs.clear();
   }

   /// \brief Perform LICM and CSE on the newly generated gather sequences.
@@ -420,10 +417,6 @@ public:
   /// vectorization factors.
   unsigned getVectorElementSize(Value *V);

-  /// Compute the minimum type sizes required to represent the entries in a
-  /// vectorizable tree.
-  void computeMinimumValueSizes();
-
 private:
   struct TreeEntry;

@@ -921,14 +914,8 @@ private:
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
-  AssumptionCache *AC;
-  DemandedBits *DB;
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
-
-  /// A map of scalar integer values to the smallest bit width with which they
-  /// can legally be represented.
-  MapVector<Value *, uint64_t> MinBWs;
 };

 #ifndef NDEBUG
@@ -1484,12 +1471,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

-  // If we have computed a smaller type for the expression, update VecTy so
-  // that the costs will be accurate.
-  if (MinBWs.count(VL[0]))
-    VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
-                            VL.size());
-
   if (E->NeedToGather) {
     if (allConstant(VL))
       return 0;
@@ -1817,19 +1798,9 @@ int BoUpSLP::getTreeCost() {
     if (EphValues.count(EU.User))
       continue;

-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost +=
-          TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
-    }
-    ExtractCost +=
-        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                           EU.Lane);
   }

   int SpillCost = getSpillCost();
@@ -2530,21 +2501,7 @@ Value *BoUpSLP::vectorizeTree() {
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
-
-  // If the vectorized tree can be rewritten in a smaller type, we truncate the
-  // vectorized root. InstCombine will then rewrite the entire expression. We
-  // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
-  if (MinBWs.count(ScalarRoot)) {
-    if (auto *I = dyn_cast<Instruction>(VectorRoot))
-      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
-    auto BundleWidth = VectorizableTree[0].Scalars.size();
-    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
-    auto *VecTy = VectorType::get(MinTy, BundleWidth);
-    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
-    VectorizableTree[0].VectorizedValue = Trunc;
-  }
+  vectorizeTree(&VectorizableTree[0]);

   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

@@ -2577,8 +2534,6 @@ Value *BoUpSLP::vectorizeTree() {
         if (PH->getIncomingValue(i) == Scalar) {
           Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
           Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-          if (MinBWs.count(ScalarRoot))
-            Ex = Builder.CreateSExt(Ex, Scalar->getType());
           CSEBlocks.insert(PH->getIncomingBlock(i));
           PH->setOperand(i, Ex);
         }
@@ -2586,16 +2541,12 @@ Value *BoUpSLP::vectorizeTree() {
       } else {
         Builder.SetInsertPoint(cast<Instruction>(User));
         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-        if (MinBWs.count(ScalarRoot))
-          Ex = Builder.CreateSExt(Ex, Scalar->getType());
         CSEBlocks.insert(cast<Instruction>(User)->getParent());
         User->replaceUsesOfWith(Scalar, Ex);
       }
     } else {
       Builder.SetInsertPoint(&F->getEntryBlock().front());
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-      if (MinBWs.count(ScalarRoot))
-        Ex = Builder.CreateSExt(Ex, Scalar->getType());
       CSEBlocks.insert(&F->getEntryBlock());
       User->replaceUsesOfWith(Scalar, Ex);
     }
@@ -3164,7 +3115,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
     // If the current instruction is a load, update MaxWidth to reflect the
     // width of the loaded value.
     else if (isa<LoadInst>(I))
-      MaxWidth = std::max(MaxWidth, DL.getTypeSizeInBits(Ty));
+      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));

     // Otherwise, we need to visit the operands of the instruction. We only
     // handle the interesting cases from buildTree here. If an operand is an
@@ -3191,171 +3142,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {

   return MaxWidth;
 }

-// Determine if a value V in a vectorizable expression Expr can be demoted to a
-// smaller type with a truncation. We collect the values that will be demoted
-// in ToDemote and additional roots that require investigating in Roots.
-static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
-                                  SmallVectorImpl<Value *> &ToDemote,
-                                  SmallVectorImpl<Value *> &Roots) {
-
-  // We can always demote constants.
-  if (isa<Constant>(V)) {
-    ToDemote.push_back(V);
-    return true;
-  }
-
-  // If the value is not an instruction in the expression with only one use, it
-  // cannot be demoted.
-  auto *I = dyn_cast<Instruction>(V);
-  if (!I || !I->hasOneUse() || !Expr.count(I))
-    return false;
-
-  switch (I->getOpcode()) {
-
-  // We can always demote truncations and extensions. Since truncations can
-  // seed additional demotion, we save the truncated value.
-  case Instruction::Trunc:
-    Roots.push_back(I->getOperand(0));
-  case Instruction::ZExt:
-  case Instruction::SExt:
-    break;
-
-  // We can demote certain binary operations if we can demote both of their
-  // operands.
-  case Instruction::Add:
-  case Instruction::Sub:
-  case Instruction::Mul:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-    if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
-        !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
-      return false;
-    break;
-
-  // We can demote selects if we can demote their true and false values.
-  case Instruction::Select: {
-    SelectInst *SI = cast<SelectInst>(I);
-    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
-        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
-      return false;
-    break;
-  }
-
-  // We can demote phis if we can demote all their incoming operands. Note that
-  // we don't need to worry about cycles since we ensure single use above.
-  case Instruction::PHI: {
-    PHINode *PN = cast<PHINode>(I);
-    for (Value *IncValue : PN->incoming_values())
-      if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
-        return false;
-    break;
-  }
-
-  // Otherwise, conservatively give up.
-  default:
-    return false;
-  }
-
-  // Record the value that we can demote.
-  ToDemote.push_back(V);
-  return true;
-}
-
-void BoUpSLP::computeMinimumValueSizes() {
-  auto &DL = F->getParent()->getDataLayout();
-
-  // If there are no external uses, the expression tree must be rooted by a
-  // store. We can't demote in-memory values, so there is nothing to do here.
-  if (ExternalUses.empty())
-    return;
-
-  // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0].Scalars;
-  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
-  if (!TreeRootIT)
-    return;
-
-  // If the expression is not rooted by a store, these roots should have
-  // external uses. We will rely on InstCombine to rewrite the expression in
-  // the narrower type. However, InstCombine only rewrites single-use values.
-  // This means that if a tree entry other than a root is used externally, it
-  // must have multiple uses and InstCombine will not rewrite it. The code
-  // below ensures that only the roots are used externally.
-  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
-  for (auto &EU : ExternalUses)
-    if (!Expr.erase(EU.Scalar))
-      return;
-  if (!Expr.empty())
-    return;
-
-  // Collect the scalar values in one lane of the vectorizable expression. We
-  // will use this context to determine which values can be demoted. If we see
-  // a truncation, we mark it as seeding another demotion.
-  for (auto &Entry : VectorizableTree)
-    Expr.insert(Entry.Scalars[0]);
-
-  // Ensure the root of the vectorizable tree doesn't form a cycle. It must
-  // have a single external user that is not in the vectorizable tree.
-  if (!TreeRoot[0]->hasOneUse() || Expr.count(*TreeRoot[0]->user_begin()))
-    return;
-
-  // Conservatively determine if we can actually truncate the root of the
-  // expression. Collect the values that can be demoted in ToDemote and
-  // additional roots that require investigating in Roots.
-  SmallVector<Value *, 32> ToDemote;
-  SmallVector<Value *, 32> Roots;
-  if (!collectValuesToDemote(TreeRoot[0], Expr, ToDemote, Roots))
-    return;
-
-  // The maximum bit width required to represent all the values that can be
-  // demoted without loss of precision. It would be safe to truncate the root
-  // of the expression to this width.
-  auto MaxBitWidth = 8u;
-
-  // We first check if all the bits of the root are demanded. If they're not,
-  // we can truncate the root to this narrower type.
-  auto Mask = DB->getDemandedBits(cast<Instruction>(TreeRoot[0]));
-  if (Mask.countLeadingZeros() > 0)
-    MaxBitWidth = std::max<unsigned>(
-        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
-
-  // If all the bits of the root are demanded, we can try a little harder to
-  // compute a narrower type. This can happen, for example, if the roots are
-  // getelementptr indices. InstCombine promotes these indices to the pointer
-  // width. Thus, all their bits are technically demanded even though the
-  // address computation might be vectorized in a smaller type.
-  //
-  // We start by looking at each entry that can be demoted. We compute the
-  // maximum bit width required to store the scalar by using ValueTracking to
-  // compute the number of high-order bits we can truncate.
-  else
-    for (auto *Scalar : ToDemote) {
-      auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
-      auto NumTypeBits = DL.getTypeSizeInBits(Scalar->getType());
-      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
-    }
-
-  // Round MaxBitWidth up to the next power-of-two.
-  if (!isPowerOf2_64(MaxBitWidth))
-    MaxBitWidth = NextPowerOf2(MaxBitWidth);
-
-  // If the maximum bit width we compute is less than the width of the roots'
-  // type, we can proceed with the narrowing. Otherwise, do nothing.
-  if (MaxBitWidth >= TreeRootIT->getBitWidth())
-    return;
-
-  // If we can truncate the root, we must collect additional values that might
-  // be demoted as a result. That is, those seeded by truncations we will
-  // modify.
-  while (!Roots.empty())
-    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
-
-  // Finally, map the values we can demote to the maximum bit width we computed.
-  for (auto *Scalar : ToDemote)
-    MinBWs[Scalar] = MaxBitWidth;
-}
-
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
@@ -3377,7 +3163,6 @@ struct SLPVectorizer : public FunctionPass {
   LoopInfo *LI;
   DominatorTree *DT;
   AssumptionCache *AC;
-  DemandedBits *DB;

   bool runOnFunction(Function &F) override {
     if (skipOptnoneFunction(F))
@@ -3391,7 +3176,6 @@
     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-    DB = &getAnalysis<DemandedBits>();

     Stores.clear();
     GEPs.clear();
@@ -3421,7 +3205,7 @@

     // Use the bottom up slp vectorizer to construct chains that start with
     // store instructions.
-    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);

     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
     // delete instructions.
@@ -3464,7 +3248,6 @@ struct SLPVectorizer : public FunctionPass {
     AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<DemandedBits>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -3569,7 +3352,6 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
     ArrayRef<Value *> Operands = Chain.slice(i, VF);

     R.buildTree(Operands);
-    R.computeMinimumValueSizes();

     int Cost = R.getTreeCost();

@@ -3769,7 +3551,6 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       Value *ReorderedOps[] = { Ops[1], Ops[0] };
       R.buildTree(ReorderedOps, None);
     }
-    R.computeMinimumValueSizes();
     int Cost = R.getTreeCost();

     if (Cost < -SLPCostThreshold) {
@@ -4036,7 +3817,6 @@ public:

      for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
        V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
-        V.computeMinimumValueSizes();

        // Estimate cost.
        int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index 9c06b24163ac..59ceba1717a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -1,5 +1,4 @@
-; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
-; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s

 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -19,13 +18,13 @@ target triple = "aarch64--linux-gnu"
 ;   return sum;
 ; }

-; PROFITABLE-LABEL: @gather_reduce_8x16_i32
+; CHECK-LABEL: @gather_reduce_8x16_i32
 ;
-; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
-; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; PROFITABLE: sext i32 [[X]] to i64
+; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; CHECK: zext <8 x i16> [[L]] to <8 x i32>
+; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; CHECK: sext i32 [[X]] to i64
 ;
 define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
@@ -138,18 +137,14 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
 }

-; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
+; CHECK-LABEL: @gather_reduce_8x16_i64
 ;
-; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
-; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; UNPROFITABLE: sext i32 [[X]] to i64
+; CHECK-NOT: load <8 x i16>
 ;
-; TODO: Although we can now vectorize this case while converting the i64
-; subtractions to i32, the cost model currently finds vectorization to be
-; unprofitable. The cost model is penalizing the sign and zero
-; extensions in the vectorized version, but they are actually free.
+; FIXME: We are currently unable to vectorize the case with i64 subtraction
+; because the zero extensions are too expensive. The solution here is to
+; convert the i64 subtractions to i32 subtractions during vectorization.
+; This would then match the case above.
 ;
 define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
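
Note: the snippet below is an illustrative sketch only, not part of the patch. It shows, in LLVM IR, the rewrite that the reverted MinBWs machinery performed when it demoted an i64-rooted expression to 32 bits: the vectorized root is truncated to the narrow vector type, and each externally used lane is sign extended back to the original scalar type, after which InstCombine can rewrite the rest of the expression in the narrower type. The function name, vector width, and demoted width here are hypothetical.

; Hypothetical sketch of the MinBWs rewrite removed by this patch.
define i64 @minbws_sketch(<8 x i64> %root) {
  ; Truncate the vectorized root to the demoted type (MinBWs says 32 bits).
  %trunc = trunc <8 x i64> %root to <8 x i32>
  ; Extract an externally used lane from the narrowed vector.
  %ex = extractelement <8 x i32> %trunc, i32 0
  ; Sign extend the extracted value back to the original scalar type.
  %sext = sext i32 %ex to i64
  ret i64 %sext
}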