From bad7d6b3735d1d855ffb07f32a272049cff085e6 Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli
Date: Mon, 24 Aug 2020 13:49:27 +0000
Subject: [PATCH] Revert "[llvm][LV] Replace `unsigned VF` with `ElementCount
 VF` [NFCI]"

Reverting because the commit message doesn't reflect the one agreed on
Phabricator at https://reviews.llvm.org/D85794.

This reverts commit c8d2b065b98fa91139cc7bb1fd1407f032ef252e.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   5 -
 llvm/include/llvm/Analysis/VectorUtils.h      |  16 +-
 llvm/include/llvm/IR/DiagnosticInfo.h         |   2 -
 llvm/include/llvm/Support/TypeSize.h          |  25 -
 llvm/lib/IR/DiagnosticInfo.cpp                |   7 -
 .../Vectorize/LoopVectorizationPlanner.h      |  19 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 707 ++++++++----------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  18 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  27 +-
 9 files changed, 335 insertions(+), 491 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a3e624842700..06d354411af6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -128,11 +128,6 @@ public:
   IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                           unsigned Factor);
 
-  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
-                          ElementCount Factor)
-      : IntrinsicCostAttributes(Id, CI, Factor.Min) {
-    assert(!Factor.Scalable);
-  }
 
   IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                           unsigned Factor, unsigned ScalarCost);
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 527bba67b257..f77048d45d01 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -300,17 +300,13 @@ namespace Intrinsic {
 typedef unsigned ID;
 }
 
-/// A helper function for converting Scalar types to vector types. If
-/// the incoming type is void, we return void. If the EC represents a
-/// scalar, we return the scalar type.
-inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
-  if (Scalar->isVoidTy() || EC.isScalar())
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+inline Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) {
+  if (Scalar->isVoidTy() || VF == 1)
     return Scalar;
-  return VectorType::get(Scalar, EC);
-}
-
-inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
-  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
+  return VectorType::get(Scalar, ElementCount::get(VF, isScalable));
 }
 
 /// Identify if the intrinsic is trivially vectorizable.
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index 33736321b42b..b7e0ecde8629 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -21,7 +21,6 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Support/CBindingWrapping.h"
-#include "llvm/Support/TypeSize.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <algorithm>
 #include <cstdint>
@@ -435,7 +434,6 @@ public:
     Argument(StringRef Key, unsigned N);
     Argument(StringRef Key, unsigned long N);
     Argument(StringRef Key, unsigned long long N);
-    Argument(StringRef Key, ElementCount EC);
    Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {}
    Argument(StringRef Key, DebugLoc dl);
  };
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 8b346ad673d8..a7f5b849bcc1 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -67,33 +67,8 @@ public:
   static ElementCount get(unsigned Min, bool Scalable) {
     return {Min, Scalable};
   }
-
-  /// Printing function.
-  void print(raw_ostream &OS) const {
-    if (Scalable)
-      OS << "vscale x ";
-    OS << Min;
-  }
-  /// Counting predicates.
-  ///
-  /// Notice that Min = 1 and Scalable = true is considered more than
-  /// one element.
-  ///
-  ///@{ No elements..
-  bool isZero() const { return Min == 0; }
-  /// Exactly one element.
-  bool isScalar() const { return !Scalable && Min == 1; }
-  /// One or more elements.
-  bool isVector() const { return (Scalable && Min != 0) || Min > 1; }
-  ///@}
 };
 
-/// Stream operator function for `ElementCount`.
-inline raw_ostream &operator<<(raw_ostream &OS, const ElementCount &EC) {
-  EC.print(OS);
-  return OS;
-}
-
 // This class is used to represent the size of types. If the type is of fixed
 // size, it will represent the exact size. If the type is a scalable vector,
 // it will represent the known minimum size.
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 28882cfa8f65..6528c723fbfa 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -213,13 +213,6 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
                                                    unsigned long long N)
     : Key(std::string(Key)), Val(utostr(N)) {}
 
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
-                                                   ElementCount EC)
-    : Key(std::string(Key)) {
-  raw_string_ostream OS(Val);
-  EC.print(OS);
-}
-
 DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc)
     : Key(std::string(Key)), Loc(Loc) {
   if (Loc) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8c3dff69e072..ecf6c8402cd6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -172,14 +172,12 @@ public:
 /// Information about vectorization costs
 struct VectorizationFactor {
   // Vector width with best cost
-  ElementCount Width;
+  unsigned Width;
   // Cost of the loop with that width
   unsigned Cost;
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  static VectorizationFactor Disabled() {
-    return {ElementCount::getFixed(1), 0};
-  }
+  static VectorizationFactor Disabled() { return {1, 0}; }
 
   bool operator==(const VectorizationFactor &rhs) const {
     return Width == rhs.Width && Cost == rhs.Cost;
@@ -229,10 +227,7 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
-  /// The best number of elements of the vector types used in the
-  /// transformed loop. BestVF = None means that vectorization is
-  /// disabled.
-  Optional<ElementCount> BestVF = None;
+  unsigned BestVF = 0;
   unsigned BestUF = 0;
 
 public:
@@ -247,14 +242,14 @@ public:
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+  Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
-  VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
+  VectorizationFactor planInVPlanNativePath(unsigned UserVF);
 
   /// Finalize the best decision and dispose of all other VPlans.
-  void setBestPlan(ElementCount VF, unsigned UF);
+  void setBestPlan(unsigned VF, unsigned UF);
 
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected VPlan.
@@ -269,7 +264,7 @@ public:
   /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
   /// returned value holds for the entire \p Range.
   static bool
-  getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
+  getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                            VFRange &Range);
 
 protected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ecc41db21a9a..86f15500d838 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -318,12 +318,11 @@ static Type *getMemInstValueType(Value *I) {
 
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
-  assert(!VF.Scalable && "scalable vectors not yet supported.");
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
   // Determine if an array of VF elements of type Ty is "bitcast compatible"
   // with a vector.
-  if (VF.isVector()) {
-    auto *VectorTy = VectorType::get(Ty, VF);
+  if (VF > 1) {
+    auto *VectorTy = FixedVectorType::get(Ty, VF);
     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
   }
@@ -405,7 +404,7 @@ public:
                       LoopInfo *LI, DominatorTree *DT,
                       const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
-                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
@@ -455,13 +454,13 @@ public:
   /// Vectorize a single GetElementPtrInst based on information gathered and
   /// decisions taken during planning.
   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
-                ElementCount VF, bool IsPtrLoopInvariant,
+                unsigned VF, bool IsPtrLoopInvariant,
                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
-  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
+  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
 
   /// A helper function to scalarize a single Instruction in the innermost loop.
   /// Generates a sequence of scalar instances for each lane between \p MinLane
@@ -749,7 +748,7 @@ protected:
 
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
-  ElementCount VF;
+  unsigned VF;
 
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
@@ -838,9 +837,8 @@ public: LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, + UnrollFactor, LVL, CM, BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -876,8 +874,7 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); + auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1042,7 +1039,7 @@ public: VectorizationFactor selectVectorizationFactor(unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(ElementCount UserVF) { + void selectUserVectorizationFactor(unsigned UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1056,7 +1053,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1065,7 +1062,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(unsigned VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1080,8 +1077,7 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector - calculateRegisterUsage(ArrayRef VFs); + SmallVector calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1099,9 +1095,8 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { - assert(VF.isVector() && - "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1115,8 +1110,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1131,8 +1126,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. 
- bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { - if (VF.isScalar()) + bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1148,8 +1143,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { + return VF > 1 && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1166,17 +1161,17 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, unsigned Cost) { - assert(VF.isVector() && "Expected VF >=2"); + assert(VF >= 2 && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, - ElementCount VF, InstWidening W, unsigned Cost) { - assert(VF.isVector() && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, + InstWidening W, unsigned Cost) { + assert(VF >= 2 && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1192,16 +1187,15 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - assert(VF.isVector() && "Expected VF >=2"); + InstWidening getWideningDecision(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1210,9 +1204,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - unsigned getWideningCost(Instruction *I, ElementCount VF) { - assert(VF.isVector() && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + unsigned getWideningCost(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1221,7 +1215,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. 
- bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { + bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast(I); if (!Trunc) @@ -1246,14 +1240,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(unsigned VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(ElementCount VF) { + void collectUniformsAndScalars(unsigned VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1304,8 +1298,7 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool isScalarWithPredication(Instruction *I, unsigned VF = 1); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1322,16 +1315,12 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool - memoryInstructionCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool - interleavedAccessCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1383,15 +1372,14 @@ public: /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); + unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - unsigned getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize); + unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1421,41 +1409,41 @@ private: /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(ElementCount VF); + VectorizationCostTy expectedCost(unsigned VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. 
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); + unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); + unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); /// The cost computation for scalarized memory instruction. - unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF); + unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF); + unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); /// The cost computation for Gather/Scatter instruction. - unsigned getGatherScatterCost(Instruction *I, ElementCount VF); + unsigned getGatherScatterCost(Instruction *I, unsigned VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, ElementCount VF); + unsigned getUniformMemOpCost(Instruction *I, unsigned VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, ElementCount VF); + unsigned getScalarizationOverhead(Instruction *I, unsigned VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1495,19 +1483,19 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1520,7 +1508,7 @@ private: /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - ElementCount VF); + unsigned VF); /// Collect the instructions that are uniform after vectorization. 
An /// instruction is uniform if we represent it with a single scalar value in @@ -1531,28 +1519,27 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(unsigned VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(unsigned VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, unsigned VF) const { Instruction *I = dyn_cast(V); - if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1567,7 +1554,7 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, - ElementCount VF) { + unsigned VF) { return SmallVector(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } @@ -1814,7 +1801,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1822,9 +1809,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.Scalable && "scalable vectors not yet supported."); Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(VF, cast(Mul)) + ? 
ConstantVector::getSplat(ElementCount::getFixed(VF), + cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); @@ -1959,9 +1946,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, - ID.getInductionOpcode()); + Value *EntryPart = + getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1971,7 +1957,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (VF <= 1) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2069,9 +2055,8 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); - assert(!VF.Scalable && - "the code below assumes a fixed number of elements at compile time"); + assert(VF > 1 && "VF should be greater than one"); + // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2093,14 +2078,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) - ? 1 - : VF.Min; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 + : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = - getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); + auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2143,9 +2126,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned LastLane = - Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2167,10 +2148,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- assert(!VF.Scalable && "VF is assumed to be non scalable."); - Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + for (unsigned Lane = 0; Lane < VF; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2234,10 +2214,9 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.Scalable && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF.Min; ++i) - ShuffleMask.push_back(VF.Min - i - 1); + for (unsigned i = 0; i < VF; ++i) + ShuffleMask.push_back(VF - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2291,8 +2270,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); + auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); // Prepare for the new pointers. SmallVector AddrParts; @@ -2308,10 +2286,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. - assert(!VF.Scalable && - "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF.Min - 1) * Group->getFactor(); + Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2346,8 +2322,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); + MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2364,11 +2339,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), - "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2394,16 +2367,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2417,8 +2388,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - assert(!VF.Scalable && "VF is assumed to be non scalable."); - auto *SubVT = VectorType::get(ScalarTy, VF); + auto *SubVT = FixedVectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2446,9 +2416,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - assert(!VF.Scalable && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), + WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2456,8 +2425,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); + BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), + "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2490,9 +2459,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto *DataTy = VectorType::get(ScalarDataTy, VF); + auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2526,17 +2493,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); + PartPtr = cast( + Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); + PartPtr = cast( + Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); + PartPtr = cast( + Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } @@ -2732,9 +2699,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. 
- assert(!VF.Scalable && "scalable vectorization is not supported yet"); - Constant *Step = ConstantInt::get(Ty, VF.Min * UF); + Constant *Step = ConstantInt::get(Ty, VF * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2743,10 +2708,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.Min * UF) && + assert(isPowerOf2_32(VF * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), - "n.rnd.up"); + TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2763,7 +2727,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. - if (VF.isVector() && Cost->requiresScalarEpilogue()) { + if (VF > 1 && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2776,8 +2740,6 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - assert(isa(DstVTy) && - "Vector type is assumed to be fixed width."); unsigned VF = DstVTy->getNumElements(); VectorType *SrcVecTy = cast(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); @@ -2823,12 +2785,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + if (!Cost->foldTailByMasking()) CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), + P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - } + // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3281,8 +3242,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). 
- assert(!VF.Scalable && "scalable vectors not yet supported."); - Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3414,9 +3374,8 @@ static void cse(BasicBlock *BB) { } unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - ElementCount VF, + unsigned VF, bool &NeedToScalarize) { - assert(!VF.Scalable && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3429,7 +3388,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // value. unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF.isScalar()) + if (VF == 1) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3441,12 +3400,13 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // packing the return values to a vector. unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + VFShape Shape = + VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) @@ -3463,7 +3423,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, } unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - ElementCount VF) { + unsigned VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3620,7 +3580,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF.isVector()) + if (VF > 1) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3661,11 +3621,9 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. - assert(!VF.Scalable && - "cannot use scalable ElementCount to determine unroll factor"); setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.Min * UF); + LI->getLoopFor(LoopScalarBody), VF * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3744,12 +3702,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. 
auto *VectorInit = ScalarInit; - if (VF.isVector()) { + if (VF > 1) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.Scalable && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.Min - 1), "vector.recur.init"); + UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), + VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3790,11 +3747,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - assert(!VF.Scalable); - SmallVector ShuffleMask(VF.Min); - ShuffleMask[0] = VF.Min - 1; - for (unsigned I = 1; I < VF.Min; ++I) - ShuffleMask[I] = I + VF.Min - 1; + SmallVector ShuffleMask(VF); + ShuffleMask[0] = VF - 1; + for (unsigned I = 1; I < VF; ++I) + ShuffleMask[I] = I + VF - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3804,10 +3760,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF.isVector() - ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) - : Incoming; + auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, + ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3820,10 +3775,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; - if (VF.isVector()) { + if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3831,9 +3786,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) + if (VF > 1) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3912,7 +3867,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // incoming scalar reduction. 
VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat(VF, Iden); + Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. @@ -3988,10 +3943,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { + if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.Scalable && "scalable vectors not yet supported."); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -4043,7 +3997,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !IsInLoopReductionPhi) { + if (VF > 1 && !IsInLoopReductionPhi) { bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); @@ -4122,17 +4076,16 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { - assert(!VF.Scalable && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 0 - : VF.Min - 1; + if (isa(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast(IncomingValue), VF) + ? 0 + : VF - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4244,7 +4197,7 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, ElementCount VF, + unsigned UF, unsigned VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { @@ -4254,7 +4207,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. 
Thus, to ensure we @@ -4314,8 +4267,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned VF) { PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4323,7 +4275,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4341,10 +4293,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - bool ScalarPHI = - (VF.isScalar()) || Cost->isInLoopReduction(cast(PN)); + bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast(PN)); Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); + ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4380,11 +4331,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = - ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); + Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4414,8 +4364,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, - ConstantInt::get(PhiType, VF.Min * UF)), + Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4425,14 +4374,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF.Min; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF.Min, ScalarStepValue), + Builder.CreateVectorSplat(VF, ScalarStepValue), "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } @@ -4460,7 +4409,6 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { - assert(!VF.Scalable && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4548,9 +4496,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, setDebugLocFromInst(Builder, CI); /// Vectorize casts. - assert(!VF.Scalable && "VF is assumed to be non scalable."); Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); + (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); @@ -4578,7 +4525,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4609,15 +4556,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF.isVector()) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - } + if (VF > 1) + TysForDecl[0] = + FixedVectorType::get(CI->getType()->getScalarType(), VF); VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. - const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF), + false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4660,11 +4607,11 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, } } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector Worklist; @@ -4847,9 +4794,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, - ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4863,7 +4808,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF.isVector()) { + if (VF > 1) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4884,8 +4829,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4921,8 +4866,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); @@ -4949,13 +4894,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -5006,7 +4951,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet PossibleNonUniformPtrs; - auto isUniformDecision = [&](Instruction *I, ElementCount VF) { + auto isUniformDecision = [&](Instruction *I, unsigned VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5303,10 +5248,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). 
- SmallVector VFs; + SmallVector VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(ElementCount::getFixed(VS)); + VFs.push_back(VS); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5321,7 +5266,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i].Min; + MaxVF = VFs[i]; break; } } @@ -5338,7 +5283,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(ElementCount::getFixed(1)).first; + float Cost = expectedCost(1).first; const float ScalarCost = Cost; unsigned Width = 1; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); @@ -5355,7 +5300,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); + VectorizationCostTy C = expectedCost(i); float VectorCost = C.first / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -5383,8 +5328,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {ElementCount::getFixed(Width), - (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; return Factor; } @@ -5444,7 +5388,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5522,8 +5466,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the max. if (VF == 1) { @@ -5537,7 +5480,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5556,7 +5499,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. 
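// Illustrative sketch, not part of the patch: the shape of the width selection
// in selectVectorizationFactor above -- take the scalar loop cost as the
// baseline, then for each candidate power-of-two VF divide the vector loop
// cost by the width (the vector loop runs fewer iterations) and keep the
// cheapest. The cost callback is a placeholder for expectedCost().
#include <functional>
#include <iostream>

unsigned selectWidth(unsigned MaxVF, const std::function<float(unsigned)> &ExpectedCost) {
  float Cost = ExpectedCost(1); // scalar loop cost
  unsigned Width = 1;
  for (unsigned I = 2; I <= MaxVF; I *= 2) {
    float VectorCost = ExpectedCost(I) / static_cast<float>(I); // per-lane cost
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = I;
    }
  }
  return Width;
}

int main() {
  // Hypothetical cost model: some fixed overhead plus a per-lane term.
  auto Model = [](unsigned VF) { return 10.0f + 2.0f * VF; };
  std::cout << "Chosen width: " << selectWidth(8, Model) << '\n'; // 8
}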
- if (VF.isVector() && !Legal->getReductionVars().empty()) { + if (VF > 1 && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5564,7 +5507,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF == 1 && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. @@ -5618,7 +5561,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5705,12 +5648,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); - return std::max(1, VF.Min * TypeSize / WidestRegister); + return std::max(1, VF * TypeSize / WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5734,7 +5676,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Count the number of live intervals. SmallMapVector RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j] == 1) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5783,10 +5725,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { SmallMapVector Invariant; for (auto Inst : LoopInvariants) { - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); + unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5834,13 +5774,12 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. 
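// Illustrative sketch, not part of the patch: the register-usage estimate
// computed by the GetRegUsage lambda above -- how many vector registers a
// value of a given scalar bit width occupies at a given VF, where
// WidestRegister is the target's vector register width in bits. The concrete
// numbers in main() are made up for the example.
#include <algorithm>
#include <iostream>

unsigned regUsage(unsigned VF, unsigned TypeSizeInBits, unsigned WidestRegister) {
  return std::max(1u, VF * TypeSizeInBits / WidestRegister);
}

int main() {
  // With 128-bit vector registers: <8 x i32> needs two registers,
  // while a scalar i32 (VF = 1) still counts as one.
  std::cout << regUsage(8, 32, 128) << '\n'; // 2
  std::cout << regUsage(1, 32, 128) << '\n'; // 1
}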
- if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5870,7 +5809,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap &ScalarCosts, - ElementCount VF) { + unsigned VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -5937,20 +5876,16 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned ScalarCost = - VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; + unsigned ScalarCost = VF * getInstructionCost(I, 1).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.Min), true, false); - assert(!VF.Scalable && "scalable vectors not yet supported."); - ScalarCost += - VF.Min * - TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF), true, false); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, + TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5963,12 +5898,10 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + else if (needsExtract(J, VF)) ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.Min), false, true); - } + APInt::getAllOnesValue(VF), false, true); } // Scale the total scalar cost by block probability. @@ -5984,8 +5917,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); +LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; // For each block. @@ -5995,8 +5927,7 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -6018,7 +5949,7 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { // unconditionally executed. For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. 
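// Illustrative sketch, not part of the patch: the block-probability scaling
// mentioned in the comment above. For the scalar (VF == 1) loop a predicated
// block is not executed on every iteration, so its cost is divided by the
// reciprocal of its assumed execution probability -- the reciprocal of 2
// (i.e. a 50% chance) used here is an assumption for the example, standing in
// for getReciprocalPredBlockProb().
#include <iostream>

float scalePredicatedBlockCost(float BlockCost, unsigned ReciprocalProb = 2) {
  return BlockCost / static_cast<float>(ReciprocalProb);
}

int main() {
  // A predicated block costing 10 contributes an expected cost of 5.
  std::cout << scalePredicatedBlockCost(10.0f) << '\n';
}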
- if (VF.isScalar() && blockNeedsPredication(BB)) + if (VF == 1 && blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6063,12 +5994,9 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -unsigned -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { - assert(VF.isVector() && - "Scalarization cost of instruction implies vectorization."); - assert(!VF.Scalable && "scalable vectors not yet supported."); +unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + unsigned VF) { + assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6081,14 +6009,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.Min * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS, TTI::TCK_RecipThroughput); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment, AS, + TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -6110,7 +6038,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, } unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { + unsigned VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6136,7 +6064,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - ElementCount VF) { + unsigned VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6154,13 +6082,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF.Min - 1)); + (isLoopInvariantStoreValue + ? 
0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { + unsigned VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6173,7 +6102,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { + unsigned VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6182,8 +6111,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -6212,10 +6140,10 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { + unsigned VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF.isScalar()) { + if (VF == 1) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6228,42 +6156,35 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, - ElementCount VF) { - assert(!VF.Scalable && - "the cost model is not yet implemented for scalable vectorization"); +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = ElementCount::getFixed(1); + VF = 1; - if (VF.isVector() && isProfitableToScalarize(I, VF)) + if (VF > 1 && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. 
auto ForcedScalar = ForcedScalars.find(VF); - if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { + if (VF > 1 && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), - false); + return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.Min; + bool TypeNotScalarized = + VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) { + unsigned VF) { - assert(!VF.Scalable && - "cannot compute scalarization overhead for scalable vectorization"); - if (VF.isScalar()) + if (VF == 1) return 0; unsigned Cost = 0; @@ -6271,7 +6192,7 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); + cast(RetTy), APInt::getAllOnesValue(VF), true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6287,14 +6208,12 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. - return Cost + - TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), - VF.Min); + return Cost + TTI.getOperandsScalarizationOverhead( + filterExtractingOperands(Ops, VF), VF); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - if (VF.isScalar()) +void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { + if (VF == 1) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6428,17 +6347,14 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. - setWideningDecision( - I, VF, CM_Scalarize, - (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + setWideningDecision(I, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(I, 1))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision( - Member, VF, CM_Scalarize, - (VF.Min * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + setWideningDecision(Member, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(Member, 1))); } } } else @@ -6449,7 +6365,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - ElementCount VF, + unsigned VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) @@ -6472,20 +6388,19 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // blocks requires also an extract of its vector compare i1 element. 
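// Illustrative sketch, not part of the patch: a rough stand-in for the
// bookkeeping in getScalarizationOverhead above. When an instruction runs as
// VF scalar copies, its non-void result needs VF insertelements to rebuild a
// vector and each vectorized operand needs VF extractelements. The per-element
// costs below are placeholders, not real TTI numbers.
#include <iostream>

unsigned scalarizationOverhead(unsigned VF, unsigned NumVectorOperands,
                               bool HasResult, unsigned InsertCost = 1,
                               unsigned ExtractCost = 1) {
  if (VF == 1)
    return 0; // nothing to pack or unpack in the scalar loop
  unsigned Cost = 0;
  if (HasResult)
    Cost += VF * InsertCost;                    // rebuild the result vector
  Cost += VF * NumVectorOperands * ExtractCost; // pull operands out of vectors
  return Cost;
}

int main() {
  // A scalarized two-operand instruction at VF = 4: 4 inserts + 8 extracts.
  std::cout << scalarizationOverhead(4, 2, true) << '\n'; // 12
}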
bool ScalarPredicatedBB = false; BranchInst *BI = cast(I); - if (VF.isVector() && BI->isConditional() && + if (VF > 1 && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - assert(!VF.Scalable && "scalable vectors not yet supported."); auto *Vec_i1Ty = - VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) + FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6500,15 +6415,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF.Min - 1, + cast(VectorTy), VF - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. - if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), @@ -6525,18 +6440,17 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { + if (VF > 1 && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += - VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6575,15 +6489,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6596,10 +6509,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (!ScalarCond) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - CondTy = VectorType::get(CondTy, VF); - } + if (!ScalarCond) + CondTy = FixedVectorType::get(CondTy, VF); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, CostKind, I); } @@ -6615,13 +6527,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - ElementCount Width = VF; - if (Width.isVector()) { + unsigned Width = VF; + if (Width > 1) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = ElementCount::getFixed(1); + Width = 1; } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6643,7 +6555,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert((isa(I) || isa(I)) && "Expected a load or a store!"); - if (VF.isScalar() || !TheLoop->contains(I)) + if (VF == 1 || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { @@ -6709,8 +6621,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - assert(!VF.Scalable && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6725,9 +6636,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.Min * - TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6833,9 +6743,8 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.Scalable && "scalable vectors not yet supported"); - ElementCount VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { + unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -6843,29 +6752,28 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { if (!OrigLoop->empty()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. 
- if (UserVF.isZero()) { - VF = ElementCount::getFixed( - determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); + if (!UserVF) { + VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { + if (VPlanBuildStressTest && VF < 2) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = ElementCount::getFixed(4); + VF = 4; } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") - << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF.Min, VF.Min); + assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF + << " to build VPlans.\n"); + buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0}; } LLVM_DEBUG( @@ -6874,11 +6782,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(!UserVF.Scalable && "scalable vectorization not yet handled"); +Optional LoopVectorizationPlanner::plan(unsigned UserVF, + unsigned UserIC) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); + Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6896,14 +6803,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } - if (!UserVF.isZero()) { + if (UserVF) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); + buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -6913,12 +6820,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); + CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
if (VF > 1) - CM.collectInstsToScalarize(ElementCount::getFixed(VF)); + CM.collectInstsToScalarize(VF); } CM.collectInLoopReductions(); @@ -6932,7 +6839,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6951,11 +6858,9 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPCallbackILV CallbackILV(ILV); - assert(BestVF.hasValue() && "Vectorization Factor is missing"); - - VPTransformState State{*BestVF, BestUF, LI, - DT, ILV.Builder, ILV.VectorLoopValueMap, - &ILV, CallbackILV}; + VPTransformState State{BestVF, BestUF, LI, + DT, ILV.Builder, ILV.VectorLoopValueMap, + &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -7069,12 +6974,12 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function &Predicate, VFRange &Range) { + const std::function &Predicate, VFRange &Range) { assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); + bool PredicateAtRangeStart = Predicate(Range.Start); for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { + if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; } @@ -7185,9 +7090,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, assert((isa(I) || isa(I)) && "Must be called with either a load or store"); - auto willWiden = [&](ElementCount VF) -> bool { - assert(!VF.Scalable && "unexpected scalable ElementCount"); - if (VF.isScalar()) + auto willWiden = [&](unsigned VF) -> bool { + if (VF == 1) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -7240,10 +7144,9 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
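// Illustrative sketch, not part of the patch: the range-clamping idiom of
// getDecisionAndClampRange above -- evaluate a per-VF predicate at the start
// of a power-of-two VF range, shrink Range.End at the first VF where the
// answer flips, and return the answer for the start of the range. The
// predicate in main() is made up for the example.
#include <functional>
#include <iostream>

struct SimpleVFRange { unsigned Start; unsigned End; }; // [Start, End), powers of two

bool decideAndClamp(const std::function<bool(unsigned)> &Predicate, SimpleVFRange &Range) {
  bool AtRangeStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtRangeStart) {
      Range.End = VF; // VFs from here on disagree and go into a later range
      break;
    }
  return AtRangeStart;
}

int main() {
  SimpleVFRange R{1, 16};
  // Hypothetical decision: "widen" only from VF = 4 upwards.
  bool Decision = decideAndClamp([](unsigned VF) { return VF >= 4; }, R);
  std::cout << Decision << " [" << R.Start << "," << R.End << ")\n"; // 0 [1,4)
}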
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function { - return [=](ElementCount VF) -> bool { - return CM.isOptimizableIVTruncate(K, VF); - }; + [&](Instruction *K) -> std::function { + return + [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( @@ -7278,9 +7181,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); - }, + [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, Range); if (IsPredicated) @@ -7291,7 +7192,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) return nullptr; - auto willWiden = [&](ElementCount VF) -> bool { + auto willWiden = [&](unsigned VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -7315,7 +7216,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](ElementCount VF) -> bool { + auto WillScalarize = [this, I](unsigned VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7378,12 +7279,11 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, - Range); + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); @@ -7591,8 +7491,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7717,10 +7617,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( std::string PlanName; raw_string_ostream RSO(PlanName); - ElementCount VF = ElementCount::getFixed(Range.Start); + unsigned VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { + for (VF *= 2; VF < Range.End; VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7747,7 +7647,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { HCFGBuilder.buildHierarchicalCFG(); for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) - Plan->addVF(ElementCount::getFixed(VF)); + Plan->addVF(VF); if (EnableVPlanPredication) { VPlanPredicator VPP(*Plan); @@ -7941,12 +7841,11 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { + if (AlsoPack && State.VF > 1) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - assert(!State.VF.Scalable && "VF is assumed to be non scalable."); - Value *Undef = - UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); + Value *Undef = UndefValue::get( + FixedVectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7957,7 +7856,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF.Min; + unsigned EndLane = IsUniform ? 1 : State.VF; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, @@ -8103,8 +8002,7 @@ static bool processLoopInVPlanNativePath( const unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = - LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -8270,8 +8168,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. 
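// Illustrative sketch, not part of the patch: the replication loop shape used
// by VPReplicateRecipe::execute above -- one scalar clone per (part, lane),
// except that uniform instructions only need lane 0 of each unroll part. The
// callback stands in for scalarizeInstruction.
#include <functional>
#include <iostream>

void replicate(unsigned UF, unsigned VF, bool IsUniform,
               const std::function<void(unsigned Part, unsigned Lane)> &EmitScalar) {
  unsigned EndLane = IsUniform ? 1 : VF;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      EmitScalar(Part, Lane);
}

int main() {
  // UF = 2, VF = 4: a non-uniform instruction is cloned 8 times, a uniform one twice.
  unsigned Clones = 0;
  replicate(2, 4, /*IsUniform=*/false, [&](unsigned, unsigned) { ++Clones; });
  std::cout << Clones << '\n'; // 8
}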
- Optional MaybeVF = - LVP.plan(ElementCount::getFixed(UserVF), UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1358f9d37c87..302a4845e9a8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,8 +300,7 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - assert(!State->VF.Scalable && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { + for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -388,7 +387,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); @@ -839,15 +838,14 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - assert(!VF.Scalable && "the code following assumes non scalables ECs"); - Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.Min, CanonicalIV, - "broadcast"); + auto VF = State.VF; + Value *VStart = VF == 1 + ? CanonicalIV + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); + for (unsigned Lane = 0; Lane < VF; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6eed236fc149..54700cb48839 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -115,7 +115,7 @@ private: /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. - ElementCount VF; + unsigned VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -126,7 +126,7 @@ private: public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -151,14 +151,12 @@ public: /// \return True if the map has a scalar entry for \p Key and \p Instance. 
bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.Min && "Queried Scalar Lane is too large."); - assert(!VF.Scalable && "VF is assumed to be non scalable."); - + assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.Min && + assert(Entry[Instance.Part].size() == VF && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -197,7 +195,7 @@ public: // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.Min, nullptr); + Entry[Part].resize(VF, nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -236,15 +234,14 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, - VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, - VPCallback &Callback) + VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, + IRBuilder<> &Builder, VectorizerValueMap &ValueMap, + InnerLoopVectorizer *ILV, VPCallback &Callback) : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - ElementCount VF; + unsigned VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1586,7 +1583,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSetVector VFs; + SmallSet VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1650,9 +1647,9 @@ public: return BackedgeTakenCount; } - void addVF(ElementCount VF) { VFs.insert(VF); } + void addVF(unsigned VF) { VFs.insert(VF); } - bool hasVF(ElementCount VF) { return VFs.count(VF); } + bool hasVF(unsigned VF) { return VFs.count(VF); } const std::string &getName() const { return Name; }
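// Illustrative sketch, not part of the patch: the UF x VF layout that the
// scalar side of VectorizerValueMap keeps per original value -- one slot per
// (unroll part, lane), as exercised by hasScalarValue/setScalarValue above.
// Plain standard containers stand in for the LLVM map types; the names are
// made up for the example.
#include <cassert>
#include <string>
#include <vector>

struct SimpleScalarMap {
  unsigned UF, VF;
  // Parts x Lanes of generated scalar values (an empty string plays the role
  // of a null entry).
  std::vector<std::vector<std::string>> Parts;

  SimpleScalarMap(unsigned UF, unsigned VF)
      : UF(UF), VF(VF), Parts(UF, std::vector<std::string>(VF)) {}

  void set(unsigned Part, unsigned Lane, std::string V) {
    assert(Part < UF && Lane < VF && "instance out of range");
    Parts[Part][Lane] = std::move(V);
  }

  bool has(unsigned Part, unsigned Lane) const {
    return Part < UF && Lane < VF && !Parts[Part][Lane].empty();
  }
};

int main() {
  SimpleScalarMap Map(/*UF=*/2, /*VF=*/4); // 2 parts x 4 lanes
  Map.set(1, 3, "%x.1.3");
  assert(Map.has(1, 3) && !Map.has(0, 0));
  return 0;
}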