diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 06d354411af6..a3e624842700 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -128,6 +128,11 @@ public: IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor); + IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, + ElementCount Factor) + : IntrinsicCostAttributes(Id, CI, Factor.Min) { + assert(!Factor.Scalable); + } IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor, unsigned ScalarCost); diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index f77048d45d01..527bba67b257 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -300,13 +300,17 @@ namespace Intrinsic { typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -inline Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { - if (Scalar->isVoidTy() || VF == 1) +/// A helper function for converting Scalar types to vector types. If +/// the incoming type is void, we return void. If the EC represents a +/// scalar, we return the scalar type. +inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { + if (Scalar->isVoidTy() || EC.isScalar()) return Scalar; - return VectorType::get(Scalar, ElementCount::get(VF, isScalable)); + return VectorType::get(Scalar, EC); +} + +inline Type *ToVectorTy(Type *Scalar, unsigned VF) { + return ToVectorTy(Scalar, ElementCount::getFixed(VF)); } /// Identify if the intrinsic is trivially vectorizable. diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index b7e0ecde8629..33736321b42b 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -21,6 +21,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -434,6 +435,7 @@ public: Argument(StringRef Key, unsigned N); Argument(StringRef Key, unsigned long N); Argument(StringRef Key, unsigned long long N); + Argument(StringRef Key, ElementCount EC); Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {} Argument(StringRef Key, DebugLoc dl); }; diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index a7f5b849bcc1..8b346ad673d8 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -67,8 +67,33 @@ public: static ElementCount get(unsigned Min, bool Scalable) { return {Min, Scalable}; } + + /// Printing function. + void print(raw_ostream &OS) const { + if (Scalable) + OS << "vscale x "; + OS << Min; + } + /// Counting predicates. + /// + /// Notice that Min = 1 and Scalable = true is considered more than + /// one element. + /// + ///@{ No elements.. + bool isZero() const { return Min == 0; } + /// Exactly one element. + bool isScalar() const { return !Scalable && Min == 1; } + /// One or more elements. + bool isVector() const { return (Scalable && Min != 0) || Min > 1; } + ///@} }; +/// Stream operator function for `ElementCount`. 
+inline raw_ostream &operator<<(raw_ostream &OS, const ElementCount &EC) { + EC.print(OS); + return OS; +} + // This class is used to represent the size of types. If the type is of fixed // size, it will represent the exact size. If the type is a scalable vector, // it will represent the known minimum size. diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 6528c723fbfa..28882cfa8f65 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -213,6 +213,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, unsigned long long N) : Key(std::string(Key)), Val(utostr(N)) {} +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + ElementCount EC) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + EC.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ecf6c8402cd6..8c3dff69e072 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -172,12 +172,14 @@ public: /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - unsigned Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. - static VectorizationFactor Disabled() { return {1, 0}; } + static VectorizationFactor Disabled() { + return {ElementCount::getFixed(1), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; @@ -227,7 +229,10 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; - unsigned BestVF = 0; + /// The best number of elements of the vector types used in the + /// transformed loop. BestVF = None means that vectorization is + /// disabled. + Optional BestVF = None; unsigned BestUF = 0; public: @@ -242,14 +247,14 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(unsigned UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(unsigned UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -264,7 +269,7 @@ public: /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. 
static bool - getDecisionAndClampRange(const std::function &Predicate, + getDecisionAndClampRange(const std::function &Predicate, VFRange &Range); protected: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 86f15500d838..ecc41db21a9a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -318,11 +318,12 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { +static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. - if (VF > 1) { - auto *VectorTy = FixedVectorType::get(Ty, VF); + if (VF.isVector()) { + auto *VectorTy = VectorType::get(Ty, VF); return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); } @@ -404,7 +405,7 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) @@ -454,13 +455,13 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -748,7 +749,7 @@ protected: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - unsigned VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. 
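For readers unfamiliar with ElementCount, the standalone sketch below illustrates the semantics that the new TypeSize.h predicates, the stream operator, and the ToVectorTy(Type *, ElementCount) overload introduced above rely on. It is a minimal stand-in, not the real llvm::ElementCount: std::ostream replaces raw_ostream so the snippet compiles without LLVM, and the printed forms ("4", "vscale x 1") simply follow the print() implementation shown in the patch.

#include <cassert>
#include <iostream>

// Stripped-down stand-in for llvm::ElementCount, mirroring the TypeSize.h hunk above.
struct ElementCount {
  unsigned Min;   // Minimum number of lanes (exact when !Scalable).
  bool Scalable;  // True for "vscale x Min" style vectors.

  static ElementCount getFixed(unsigned Min) { return {Min, false}; }
  static ElementCount get(unsigned Min, bool Scalable) { return {Min, Scalable}; }

  // Counting predicates as introduced in the patch: note that
  // {Min = 1, Scalable = true} counts as a vector, not a scalar.
  bool isZero() const { return Min == 0; }
  bool isScalar() const { return !Scalable && Min == 1; }
  bool isVector() const { return (Scalable && Min != 0) || Min > 1; }

  void print(std::ostream &OS) const {
    if (Scalable)
      OS << "vscale x ";
    OS << Min;
  }
};

std::ostream &operator<<(std::ostream &OS, const ElementCount &EC) {
  EC.print(OS);
  return OS;
}

int main() {
  ElementCount Fixed4 = ElementCount::getFixed(4);
  ElementCount Scalable1 = ElementCount::get(1, /*Scalable=*/true);

  assert(Fixed4.isVector() && !Fixed4.isScalar());
  assert(Scalable1.isVector() && !Scalable1.isScalar()); // one scalable lane is "more than one element"
  assert(ElementCount::getFixed(1).isScalar());

  std::cout << Fixed4 << "\n";    // prints "4"
  std::cout << Scalable1 << "\n"; // prints "vscale x 1"
}

The new DiagnosticInfoOptimizationBase::Argument(StringRef, ElementCount) constructor stringifies the factor the same way, so optimization remarks can report a scalable vectorization factor such as "vscale x 4" instead of a bare integer.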
@@ -837,8 +838,9 @@ public: LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM, BFI, PSI) {} + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -874,7 +876,8 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1039,7 +1042,7 @@ public: VectorizationFactor selectVectorizationFactor(unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(unsigned UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1053,7 +1056,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1062,7 +1065,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(unsigned VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1077,7 +1080,8 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector calculateRegisterUsage(ArrayRef VFs); + SmallVector + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1095,8 +1099,9 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1110,8 +1115,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1126,8 +1131,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. 
- bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1143,8 +1148,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1161,17 +1166,17 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, - InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup *Grp, + ElementCount VF, InstWidening W, unsigned Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1187,15 +1192,16 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(VF.isVector() && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1204,9 +1210,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - unsigned getWideningCost(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + unsigned getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1215,7 +1221,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. 
- bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast(I); if (!Trunc) @@ -1240,14 +1246,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(unsigned VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1298,7 +1304,8 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1315,12 +1322,16 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1372,14 +1383,15 @@ public: /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + unsigned getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1409,41 +1421,41 @@ private: /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(unsigned VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. 
- VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - unsigned getGatherScatterCost(Instruction *I, unsigned VF); + unsigned getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + unsigned getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + unsigned getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1483,19 +1495,19 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1508,7 +1520,7 @@ private: /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - unsigned VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. 
An /// instruction is uniform if we represent it with a single scalar value in @@ -1519,27 +1531,28 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(unsigned VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(unsigned VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, unsigned VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); - if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1554,7 +1567,7 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, - unsigned VF) { + ElementCount VF) { return SmallVector(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } @@ -1801,7 +1814,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1809,9 +1822,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(ElementCount::getFixed(VF), - cast(Mul)) + ? 
ConstantVector::getSplat(VF, cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); @@ -1946,8 +1959,9 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = - getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1957,7 +1971,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF <= 1) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2055,8 +2069,9 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF > 1 && "VF should be greater than one"); - + assert(VF.isVector() && "VF should be greater than one"); + assert(!VF.Scalable && + "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2078,12 +2093,14 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) + ? 1 + : VF.Min; // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *StartIdx = + getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2126,7 +2143,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned LastLane = + Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2148,9 +2167,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF; ++Lane) + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2214,9 +2234,10 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); + assert(!VF.Scalable && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(VF - i - 1); + for (unsigned i = 0; i < VF.Min; ++i) + ShuffleMask.push_back(VF.Min - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2270,7 +2291,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector AddrParts; @@ -2286,8 +2308,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. + assert(!VF.Scalable && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF - 1) * Group->getFactor(); + Index += (VF.Min - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2322,7 +2346,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(!VF.Scalable && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2339,9 +2364,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF.Min), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2367,14 +2394,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - auto StrideMask = createStrideMask(I, InterleaveFactor, VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2388,7 +2417,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - auto *SubVT = FixedVectorType::get(ScalarTy, VF); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2416,8 +2446,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. + assert(!VF.Scalable && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2425,8 +2456,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), - "interleaved.mask"); + BlockInMaskPart, Undefs, + createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2459,7 +2490,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2493,17 +2526,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); PartPtr->setIsInBounds(InBounds); } @@ -2699,7 +2732,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - Constant *Step = ConstantInt::get(Ty, VF * UF); + // This is where we can make the step a runtime constant. 
+ assert(!VF.Scalable && "scalable vectorization is not supported yet"); + Constant *Step = ConstantInt::get(Ty, VF.Min * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2708,9 +2743,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF * UF) && + assert(isPowerOf2_32(VF.Min * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), + "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2727,7 +2763,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. - if (VF > 1 && Cost->requiresScalarEpilogue()) { + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2740,6 +2776,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. + assert(isa(DstVTy) && + "Vector type is assumed to be fixed width."); unsigned VF = DstVTy->getNumElements(); VectorType *SrcVecTy = cast(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); @@ -2785,11 +2823,12 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) + if (!Cost->foldTailByMasking()) { + assert(!VF.Scalable && "scalable vectors not yet supported."); CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), + P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), "min.iters.check"); - + } // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3242,7 +3281,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). 
- Constant *Step = ConstantInt::get(IdxTy, VF * UF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3374,8 +3414,9 @@ static void cse(BasicBlock *BB) { } unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - unsigned VF, + ElementCount VF, bool &NeedToScalarize) { + assert(!VF.Scalable && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3388,7 +3429,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // value. unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF == 1) + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3400,13 +3441,12 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // packing the return values to a vector. unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) @@ -3423,7 +3463,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, } unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - unsigned VF) { + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3580,7 +3620,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF > 1) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3621,9 +3661,11 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. + assert(!VF.Scalable && + "cannot use scalable ElementCount to determine unroll factor"); setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF * UF); + LI->getLoopFor(LoopScalarBody), VF.Min * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3702,11 +3744,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. 
auto *VectorInit = ScalarInit; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + assert(!VF.Scalable && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), - VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); + UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.Min - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3747,10 +3790,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector ShuffleMask(VF); - ShuffleMask[0] = VF - 1; - for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = I + VF - 1; + assert(!VF.Scalable); + SmallVector ShuffleMask(VF.Min); + ShuffleMask[0] = VF.Min - 1; + for (unsigned I = 1; I < VF.Min; ++I) + ShuffleMask[I] = I + VF.Min - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3760,9 +3804,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3775,10 +3820,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3786,9 +3831,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3867,7 +3912,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // incoming scalar reduction. 
VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden); + Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. @@ -3943,9 +3988,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -3997,7 +4043,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. - if (VF > 1 && !IsInLoopReductionPhi) { + if (VF.isVector() && !IsInLoopReductionPhi) { bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); @@ -4076,16 +4122,17 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { + assert(!VF.Scalable && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 0 - : VF - 1; + if (isa(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast(IncomingValue), VF) + ? 0 + : VF.Min - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4197,7 +4244,7 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, unsigned VF, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { @@ -4207,7 +4254,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. 
Thus, to ensure we @@ -4267,7 +4314,8 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF) { + ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4275,7 +4323,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4293,9 +4341,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast(PN)); + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast(PN)); Type *VecTy = - ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4331,10 +4380,11 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min; for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); + Constant *Idx = + ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4364,7 +4414,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), + Builder.CreateMul(ScalarStepValue, + ConstantInt::get(PhiType, VF.Min * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4374,14 +4425,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); + for (unsigned i = 0; i < VF.Min; ++i) + Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF, ScalarStepValue), + Builder.CreateVectorSplat(VF.Min, ScalarStepValue), "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } @@ -4409,6 +4460,7 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { + assert(!VF.Scalable && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4496,8 +4548,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, setDebugLocFromInst(Builder, CI); /// Vectorize casts. + assert(!VF.Scalable && "VF is assumed to be non scalable."); Type *DestTy = - (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); + (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); @@ -4525,7 +4578,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4556,15 +4609,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = - FixedVectorType::get(CI->getType()->getScalarType(), VF); + if (VF.isVector()) { + assert(!VF.Scalable && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. - const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF), - false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4607,11 +4660,11 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, } } -void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector Worklist; @@ -4794,7 +4847,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4808,7 +4863,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF > 1) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4829,8 +4884,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4866,8 +4921,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); @@ -4894,13 +4949,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, return true; } -void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4951,7 +5006,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet PossibleNonUniformPtrs; - auto isUniformDecision = [&](Instruction *I, unsigned VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5248,10 +5303,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). 
- SmallVector VFs; + SmallVector VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5266,7 +5321,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Min; break; } } @@ -5283,7 +5338,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; + float Cost = expectedCost(ElementCount::getFixed(1)).first; const float ScalarCost = Cost; unsigned Width = 1; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); @@ -5300,7 +5355,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(i); + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); float VectorCost = C.first / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -5328,7 +5383,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } @@ -5388,7 +5444,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5466,7 +5522,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); // Check if the user has overridden the max. if (VF == 1) { @@ -5480,7 +5537,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5499,7 +5556,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. 
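// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the cost comparison performed
// by selectVectorizationFactor above, restated standalone. Each candidate
// loop cost is divided by its width to get a cost per scalar iteration and
// the cheapest width wins; the winner is then wrapped as
// ElementCount::getFixed(Width). The cost numbers are invented for the
// example.
#include <cstdio>

int main() {
  struct { unsigned Width; float LoopCost; } Candidates[] = {
      {1, 32.0f}, {2, 20.0f}, {4, 14.0f}, {8, 13.0f}};
  unsigned BestWidth = 1;
  float BestPerLane = Candidates[0].LoopCost; // scalar loop as the baseline
  for (const auto &C : Candidates) {
    float PerLane = C.LoopCost / static_cast<float>(C.Width);
    std::printf("VF=%u costs %.2f per scalar iteration\n", C.Width, PerLane);
    if (PerLane < BestPerLane) {
      BestPerLane = PerLane;
      BestWidth = C.Width;
    }
  }
  std::printf("selected VF = %u\n", BestWidth);
  return 0;
}
// ---------------------------------------------------------------------------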
- if (VF > 1 && !Legal->getReductionVars().empty()) { + if (VF.isVector() && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5507,7 +5564,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. @@ -5561,7 +5618,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5648,11 +5705,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - return std::max(1, VF * TypeSize / WidestRegister); + assert(!VF.Scalable && "scalable vectors not yet supported."); + return std::max(1, VF.Min * TypeSize / WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5676,7 +5734,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Count the number of live intervals. SmallMapVector RegUsage; - if (VFs[j] == 1) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5725,8 +5783,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { SmallMapVector Invariant; for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5774,12 +5834,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. 
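// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the register-usage estimate
// from the GetRegUsage lambda above, written as a standalone function over
// plain integers. A <VF.Min x Ty> value is charged VF.Min * TypeSize /
// WidestRegister registers, and never fewer than one. The register width and
// type sizes below are example values.
#include <algorithm>
#include <cstdio>

static unsigned getRegUsage(unsigned TypeSizeInBits, unsigned VFMin,
                            unsigned WidestRegisterBits) {
  return std::max(1u, VFMin * TypeSizeInBits / WidestRegisterBits);
}

int main() {
  // <8 x i32> against 128-bit vector registers occupies two registers.
  std::printf("<8 x i32>: %u registers\n", getRegUsage(32, 8, 128));
  // A narrow value still occupies at least one register.
  std::printf("i32 scalar: %u register(s)\n", getRegUsage(32, 1, 128));
  return 0;
}
// ---------------------------------------------------------------------------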
- if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5809,7 +5870,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap &ScalarCosts, - unsigned VF) { + ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -5876,16 +5937,20 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + assert(!VF.Scalable && "scalable vectors not yet supported."); + unsigned ScalarCost = + VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF), true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, - TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.Min), true, false); + assert(!VF.Scalable && "scalable vectors not yet supported."); + ScalarCost += + VF.Min * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5898,10 +5963,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) + else if (needsExtract(J, VF)) { + assert(!VF.Scalable && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF), false, true); + APInt::getAllOnesValue(VF.Min), false, true); + } } // Scale the total scalar cost by block probability. @@ -5917,7 +5984,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(unsigned VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); VectorizationCostTy Cost; // For each block. @@ -5927,7 +5995,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5949,7 +6018,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // unconditionally executed. For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. 
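// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the shape of the scalar-cost
// side of computePredInstDiscount above, with placeholder numbers standing
// in for the TTI queries. The cost of keeping an instruction scalar grows
// with VF.Min (one copy plus one phi per lane) and is then scaled down by
// the probability that the predicated block executes; the reciprocal-
// probability scaling mirrors how expectedCost treats predicated blocks.
#include <cstdio>

int main() {
  unsigned VFMin = 4;                   // fixed width; scalable is asserted out
  unsigned ScalarInstCost = 1;          // placeholder getInstructionCost(I, VF=1)
  unsigned PhiCost = 1;                 // placeholder TTI.getCFInstrCost(PHI)
  unsigned ReciprocalPredBlockProb = 2; // block assumed to run every other iteration

  unsigned ScalarCost = VFMin * ScalarInstCost + VFMin * PhiCost;
  ScalarCost /= ReciprocalPredBlockProb; // scale by block probability

  unsigned VectorCost = 6;              // placeholder widened cost
  std::printf("vector=%u scalar(scaled)=%u\n", VectorCost, ScalarCost);
  return 0;
}
// ---------------------------------------------------------------------------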
- if (VF == 1 && blockNeedsPredication(BB)) + if (VF.isScalar() && blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5994,9 +6063,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - unsigned VF) { - assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); +unsigned +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.Scalable && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6009,14 +6081,14 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS, - TTI::TCK_RecipThroughput); + Cost += VF.Min * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -6038,7 +6110,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, } unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6064,7 +6136,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6082,14 +6154,13 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue - ? 0 - : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF - 1)); + (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost( + Instruction::ExtractElement, + VectorTy, VF.Min - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6102,7 +6173,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6111,7 +6182,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.Scalable && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -6140,10 +6212,10 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - unsigned VF) { + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF == 1) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6156,35 +6228,42 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable && + "the cost model is not yet implemented for scalable vectorization"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = 1; + VF = ElementCount::getFixed(1); - if (VF > 1 && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. 
auto ForcedScalar = ForcedScalars.find(VF); - if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), + false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = - VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; + bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.Min; return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - unsigned VF) { + ElementCount VF) { - if (VF == 1) + assert(!VF.Scalable && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; unsigned Cost = 0; @@ -6192,7 +6271,7 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF), true, false); + cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6208,12 +6287,14 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. - return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF); + return Cost + + TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), + VF.Min); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { - if (VF == 1) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + assert(!VF.Scalable && "scalable vectors not yet supported."); + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6347,14 +6428,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. - setWideningDecision(I, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(I, 1))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision(Member, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(Member, 1))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.Min * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -6365,7 +6449,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - unsigned VF, + ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) @@ -6388,19 +6472,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // blocks requires also an extract of its vector compare i1 element. 
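// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: a simplified model of the
// scalarization overhead computed above. The real code hands TTI an all-ones
// demanded-elements mask of width VF.Min; here every lane is demanded and
// each insertelement/extractelement is assumed to cost one unit, which is an
// assumption rather than any target's real cost.
#include <cstdio>

static unsigned scalarizationOverhead(unsigned VFMin, unsigned NumVectorOperands,
                                      bool ProducesVectorResult) {
  unsigned Cost = 0;
  if (ProducesVectorResult)
    Cost += VFMin;                    // one insertelement per result lane
  Cost += VFMin * NumVectorOperands;  // one extractelement per operand lane
  return Cost;
}

int main() {
  // A two-operand instruction scalarized at VF.Min = 4.
  std::printf("overhead = %u\n", scalarizationOverhead(4, 2, true));
  return 0;
}
// ---------------------------------------------------------------------------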
bool ScalarPredicatedBB = false; BranchInst *BI = cast(I); - if (VF > 1 && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. + assert(!VF.Scalable && "scalable vectors not yet supported."); auto *Vec_i1Ty = - FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6415,15 +6500,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF - 1, + cast(VectorTy), VF.Min - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. - if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), @@ -6440,17 +6525,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += + VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6489,14 +6575,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.Scalable && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6509,9 +6596,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (!ScalarCond) - CondTy = FixedVectorType::get(CondTy, VF); - + if (!ScalarCond) { + assert(!VF.Scalable && "VF is assumed to be non scalable."); + CondTy = VectorType::get(CondTy, VF); + } return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, CostKind, I); } @@ -6527,13 +6615,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - unsigned Width = VF; - if (Width > 1) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = 1; + Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6555,7 +6643,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert((isa(I) || isa(I)) && "Expected a load or a store!"); - if (VF == 1 || !TheLoop->contains(I)) + if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { @@ -6621,7 +6709,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.Scalable && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6636,8 +6725,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.Min * + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6743,8 +6833,9 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { - unsigned VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.Scalable && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -6752,28 +6843,29 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { if (!OrigLoop->empty()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. 
- if (!UserVF) { - VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && VF < 2) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = 4; + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF - << " to build VPlans.\n"); - buildVPlans(VF, VF); + assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); + buildVPlans(VF.Min, VF.Min); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -6782,10 +6874,11 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional LoopVectorizationPlanner::plan(unsigned UserVF, - unsigned UserIC) { +Optional +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(!UserVF.Scalable && "scalable vectorization not yet handled"); assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); + Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6803,14 +6896,14 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, CM.invalidateCostModelingDecisions(); } - if (UserVF) { + if (!UserVF.isZero()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -6820,12 +6913,12 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
if (VF > 1) - CM.collectInstsToScalarize(VF); + CM.collectInstsToScalarize(ElementCount::getFixed(VF)); } CM.collectInLoopReductions(); @@ -6839,7 +6932,7 @@ Optional LoopVectorizationPlanner::plan(unsigned UserVF, return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6858,9 +6951,11 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPCallbackILV CallbackILV(ILV); - VPTransformState State{BestVF, BestUF, LI, - DT, ILV.Builder, ILV.VectorLoopValueMap, - &ILV, CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, BestUF, LI, + DT, ILV.Builder, ILV.VectorLoopValueMap, + &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -6974,12 +7069,12 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function &Predicate, VFRange &Range) { + const std::function &Predicate, VFRange &Range) { assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(Range.Start); + bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(TmpVF) != PredicateAtRangeStart) { + if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { Range.End = TmpVF; break; } @@ -7090,8 +7185,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, assert((isa(I) || isa(I)) && "Must be called with either a load or store"); - auto willWiden = [&](unsigned VF) -> bool { - if (VF == 1) + auto willWiden = [&](ElementCount VF) -> bool { + assert(!VF.Scalable && "unexpected scalable ElementCount"); + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -7144,9 +7240,10 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
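// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the behaviour of
// getDecisionAndClampRange above, restated over plain unsigned widths (the
// patched version feeds each width through ElementCount::getFixed first).
// The predicate is evaluated at Range.Start; the first power-of-two VF where
// the answer flips becomes the new exclusive end of the range.
#include <cstdio>
#include <functional>

struct VFRangeSketch { unsigned Start; unsigned End; }; // stand-in for VFRange

static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Pred,
                                     VFRangeSketch &Range) {
  bool PredicateAtRangeStart = Pred(Range.Start);
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Pred(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }
  return PredicateAtRangeStart;
}

int main() {
  VFRangeSketch Range{1, 16};
  // Example predicate: pretend some widening decision only holds for VF < 8.
  bool Decision =
      getDecisionAndClampRange([](unsigned VF) { return VF < 8; }, Range);
  std::printf("decision=%d clamped range=[%u,%u)\n", Decision, Range.Start,
              Range.End);
  return 0;
}
// ---------------------------------------------------------------------------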
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + [&](Instruction *K) -> std::function { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( @@ -7181,7 +7278,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -7192,7 +7291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) return nullptr; - auto willWiden = [&](unsigned VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -7216,7 +7315,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](unsigned VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7279,11 +7378,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); @@ -7491,8 +7591,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7617,10 +7717,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( std::string PlanName; raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; + ElementCount VF = ElementCount::getFixed(Range.Start); Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { + for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7647,7 +7747,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { HCFGBuilder.buildHierarchicalCFG(); for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) - Plan->addVF(VF); + Plan->addVF(ElementCount::getFixed(VF)); if (EnableVPlanPredication) { VPlanPredicator VPP(*Plan); @@ -7841,11 +7941,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF > 1) { + if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - Value *Undef = UndefValue::get( - FixedVectorType::get(Ingredient->getType(), State.VF)); + assert(!State.VF.Scalable && "VF is assumed to be non scalable."); + Value *Undef = + UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7856,7 +7957,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; + unsigned EndLane = IsUniform ? 1 : State.VF.Min; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, @@ -8002,7 +8103,8 @@ static bool processLoopInVPlanNativePath( const unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = + LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -8168,7 +8270,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. 
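// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: how the replicate recipe above
// enumerates scalar instances after the change to State.VF.Min. A uniform
// ingredient only needs lane 0 of each unroll part; otherwise every lane up
// to VF.Min is generated. The UF/VF values are examples.
#include <cstdio>

int main() {
  unsigned UF = 2, VFMin = 4;
  bool IsUniform = false;
  unsigned EndLane = IsUniform ? 1 : VFMin;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      std::printf("scalarize instance {Part=%u, Lane=%u}\n", Part, Lane);
  return 0;
}
// ---------------------------------------------------------------------------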
- Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = + LVP.plan(ElementCount::getFixed(UserVF), UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 302a4845e9a8..1358f9d37c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,7 +300,8 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + assert(!State->VF.Scalable && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -387,7 +388,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); @@ -838,14 +839,15 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - auto VF = State.VF; - Value *VStart = VF == 1 - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + ElementCount VF = State.VF; + assert(!VF.Scalable && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() ? CanonicalIV + : Builder.CreateVectorSplat(VF.Min, CanonicalIV, + "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 54700cb48839..6eed236fc149 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -115,7 +115,7 @@ private: /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. - unsigned VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -126,7 +126,7 @@ private: public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -151,12 +151,14 @@ public: /// \return True if the map has a scalar entry for \p Key and \p Instance. 
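// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the step constants built by
// VPWidenCanonicalIVRecipe::execute above. For each unroll part the lane
// indices are Part * VF.Min + Lane, which are then combined with the
// broadcast canonical induction variable. Plain integers stand in for the
// ConstantInt values here.
#include <cstdio>

int main() {
  unsigned UF = 2, VFMin = 4;
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("part %u step vector: {", Part);
    for (unsigned Lane = 0; Lane < VFMin; ++Lane)
      std::printf("%s%u", Lane ? ", " : "", Part * VFMin + Lane);
    std::printf("}\n");
  }
  return 0;
}
// ---------------------------------------------------------------------------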
bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + assert(Instance.Lane < VF.Min && "Queried Scalar Lane is too large."); + assert(!VF.Scalable && "VF is assumed to be non scalable."); + if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF && + assert(Entry[Instance.Part].size() == VF.Min && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -195,7 +197,7 @@ public: // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); + Entry[Part].resize(VF.Min, nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -234,14 +236,15 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, - IRBuilder<> &Builder, VectorizerValueMap &ValueMap, - InnerLoopVectorizer *ILV, VPCallback &Callback) + VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - unsigned VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1583,7 +1586,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSet VFs; + SmallSetVector VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1647,9 +1650,9 @@ public: return BackedgeTakenCount; } - void addVF(unsigned VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(unsigned VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; }
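// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the storage shape behind the
// VectorizerValueMap scalar entries above. Each key owns UF parts and each
// part holds VF.Min lanes (scalable VFs are still rejected by the asserts),
// modelled here with nested std::vectors instead of the LLVM containers.
#include <cstdio>
#include <vector>

int main() {
  unsigned UF = 2, VFMin = 4;
  using ScalarParts = std::vector<std::vector<const char *>>;
  ScalarParts Entry(UF, std::vector<const char *>(VFMin, nullptr));
  Entry[1][3] = "scalar instruction for {Part=1, Lane=3}";
  std::printf("parts=%zu lanes-per-part=%zu slot[1][3] filled=%d\n",
              Entry.size(), Entry[0].size(), Entry[1][3] != nullptr);
  return 0;
}
// ---------------------------------------------------------------------------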