From bd6dc142306f5caaa87725f5f58c29b37da9cc32 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sun, 20 Aug 2017 23:17:11 +0000 Subject: [PATCH] Revert r311077: [LV] Using VPlan ... This causes LLVM to assert fail on PPC64 and crash / infloop in other cases. Filed http://llvm.org/PR34248 with reproducer attached. llvm-svn: 311304 --- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 - .../Transforms/Vectorize/LoopVectorize.cpp | 1587 ++++++----------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 401 ----- llvm/lib/Transforms/Vectorize/VPlan.h | 789 -------- .../AArch64/aarch64-predication.ll | 12 +- .../AArch64/predication_costs.ll | 22 +- .../SystemZ/load-store-scalarization-cost.ll | 6 +- .../LoopVectorize/first-order-recurrence.ll | 7 + .../LoopVectorize/if-pred-non-void.ll | 12 +- 9 files changed, 558 insertions(+), 2279 deletions(-) delete mode 100644 llvm/lib/Transforms/Vectorize/VPlan.cpp delete mode 100644 llvm/lib/Transforms/Vectorize/VPlan.h diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 7622ed6d194f..1aea73cd4a32 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp - VPlan.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6c417ba340f9..44ffcfd3de06 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,7 +47,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" -#include "VPlan.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" @@ -99,7 +98,6 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" #include -#include #include #include @@ -252,10 +250,6 @@ class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; -class VPInterleaveRecipe; -class VPReplicateRecipe; -class VPWidenIntOrFpInductionRecipe; -class VPWidenRecipe; /// Returns true if the given loop body has a cycle, excluding the loop /// itself. @@ -368,9 +362,6 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } -} // end anonymous namespace - -namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -402,11 +393,10 @@ public: AddedSafetyChecks(false) {} /// Create a new empty loop. Unlink the old loop and connect the new one. - /// Return the pre-header block of the new loop. - BasicBlock *createVectorizedLoopSkeleton(); + void createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I); + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -416,72 +406,15 @@ public: virtual ~InnerLoopVectorizer() {} +protected: + /// A small list of PHINodes. + typedef SmallVector PhiVector; + /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. typedef SmallVector VectorParts; - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VectorParts createBlockInMask(BasicBlock *BB); - - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); - - /// A helper function to scalarize a single Instruction in the innermost loop. - /// Generates a sequence of scalar instances for each lane between \p MinLane - /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive.. - void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, - bool IfPredicateInstr); - - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); - - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll and vector indices \p Instance. If the value has been - /// vectorized but not scalarized, the necessary extractelement instruction - /// will be generated. - Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); - - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); - -protected: - /// A small list of PHINodes. - typedef SmallVector PhiVector; - /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the @@ -524,18 +457,37 @@ protected: /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); + /// Predicate conditional instructions that require predication on their + /// respective conditions. + void predicateInstructions(); + /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); /// A helper function that computes the predicate of the edge between SRC /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each + /// scalarized instruction behind an if block predicated on the control + /// dependence of the instruction. + void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false); + /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -569,6 +521,11 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal); + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. bool shouldScalarizeInstruction(Instruction *I) const; @@ -576,6 +533,38 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// a new vector value on-demand by inserting the scalar values into a vector + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into a vector. + Value *getOrCreateVectorValue(Value *V, unsigned Part); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); + + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -616,6 +605,127 @@ protected: /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with setVectorValue and setScalarValue, + /// which assert that an entry was not already added before. If an entry is to + /// replace an existing one, call resetVectorValue. This is currently needed + /// to modify the mapped values during "fix-up" operations that occur once the + /// first phase of widening is complete. These operations include type + /// truncation and the second phase of recurrence widening. + /// + /// Entries from either map can be retrieved using the getVectorValue and + /// getScalarValue functions, which assert that the desired value exists. + + struct ValueMap { + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. + bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; + } + + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); + } + + /// \return True if the map has a scalar entry for \p Key, \p Part and + /// \p Part. + bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { + assert(Part < UF && "Queried Scalar Part is too large."); + assert(Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); + return Entry[Part][Lane] != nullptr; + } + + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; + } + + /// Retrieve the existing scalar value that corresponds to \p Key, \p Part + /// and \p Lane. + Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { + assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); + return ScalarMapStorage[Key][Part][Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set. If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key for \p Part and \p Lane. + /// Assumes such a value is not already set. + void setScalarValue(Value *Key, unsigned Part, unsigned Lane, + Value *Scalar) { + assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Part][Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. + /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } + + private: + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. + unsigned UF; + + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + std::map VectorMapStorage; + std::map ScalarMapStorage; + }; + /// The original loop. Loop *OrigLoop; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies @@ -648,6 +758,7 @@ protected: /// vector elements. unsigned VF; +protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -681,10 +792,11 @@ protected: /// vectorized loop. A key value can map to either vector values, scalar /// values or both kinds of values, depending on whether the key was /// vectorized and scalarized. - VectorizerValueMap VectorLoopValueMap; + ValueMap VectorLoopValueMap; - /// Store instructions that were predicated. - SmallVector PredicatedInstructions; + /// Store instructions that should be predicated, as a pair + /// + SmallVector, 4> PredicatedInstructions; EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; /// Trip count of the original loop. @@ -704,8 +816,6 @@ protected: // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; - - friend class LoopVectorizationPlanner; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -721,6 +831,7 @@ public: UnrollFactor, LVL, CM) {} private: + void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode = @@ -797,10 +908,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef To, } } -} // namespace llvm - -namespace { - /// \brief The group of interleaved loads/stores sharing the same stride and /// close to each other. /// @@ -1822,7 +1929,6 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -1936,22 +2042,6 @@ public: return Legal->isInductionVariable(Op); } - /// Collects the instructions to scalarize for each predicated instruction in - /// the loop. - void collectInstsToScalarize(unsigned VF); - - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { - // Do the analysis once. - if (VF == 1 || Uniforms.count(VF)) - return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); - } - private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. @@ -2053,6 +2143,10 @@ private: int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, unsigned VF); + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in /// the vectorized loop corresponding to each vector iteration. Examples of @@ -2071,6 +2165,18 @@ private: /// iteration of the original scalar loop. void collectLoopScalars(unsigned VF); + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. typedef DenseMap, @@ -2108,71 +2214,23 @@ public: SmallPtrSet VecValuesToIgnore; }; -} // end anonymous namespace - -namespace llvm { -/// InnerLoopVectorizer vectorizes loops which contain only one basic /// LoopVectorizationPlanner - drives the vectorization process after having /// passed Legality checks. -/// The planner builds and optimizes the Vectorization Plans which record the -/// decisions how to vectorize the given loop. In particular, represent the -/// control-flow of the vectorized version, the replication of instructions that -/// are to be scalarized, and interleave access groups. class LoopVectorizationPlanner { - /// The loop that we evaluate. - Loop *OrigLoop; - - /// Loop Info analysis. - LoopInfo *LI; - - /// Target Library Info. - const TargetLibraryInfo *TLI; - - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The legality analysis. - LoopVectorizationLegality *Legal; - - /// The profitablity analysis. - LoopVectorizationCostModel &CM; - - SmallVector VPlans; - - unsigned BestVF; - unsigned BestUF; - public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - BestVF(0), BestUF(0) {} + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} - ~LoopVectorizationPlanner() { - while (!VPlans.empty()) { - VPlan *Plan = VPlans.back(); - VPlans.pop_back(); - delete Plan; - } - } + ~LoopVectorizationPlanner() {} /// Plan how to best vectorize, return the best VF and its cost. LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); - /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); - - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected VPlan. - void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - - void printPlans(raw_ostream &O) { - for (VPlan *Plan : VPlans) - O << *Plan; - } + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); protected: /// Collect the instructions from the original loop that would be trivially @@ -2180,77 +2238,20 @@ protected: void collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions); - /// A range of powers-of-2 vectorization factors with fixed start and - /// adjustable end. The range includes start and excludes end, e.g.,: - /// [1, 9) = {1, 2, 4, 8} - struct VFRange { - const unsigned Start; // A power of 2. - unsigned End; // Need not be a power of 2. If End <= Start range is empty. - }; - - /// Test a \p Predicate on a \p Range of VF's. Return the value of applying - /// \p Predicate on Range.Start, possibly decreasing Range.End such that the - /// returned value holds for the entire \p Range. - bool getDecisionAndClampRange(const std::function &Predicate, - VFRange &Range); - - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); - private: - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is , as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + /// The loop that we evaluate. + Loop *OrigLoop; - /// Check if an induction recipe should be constructed for \I within the given - /// VF \p Range. If so build and return it. If not, return null. \p Range.End - /// may be decreased to ensure same decision from \p Range.Start to - /// \p Range.End. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, - VFRange &Range); + /// Loop Info analysis. + LoopInfo *LI; - /// Check if \I can be widened within the given VF \p Range. If \I can be - /// widened for Range.Start, extend \p LastWidenRecipe to include \p I if - /// possible or else build a new VPWidenRecipe for it, and return the - /// VPWidenRecipe that includes \p I. If \p I cannot be widened for - /// Range.Start \return null. Range.End may be decreased to ensure same - /// decision from \p Range.Start to \p Range.End. - VPWidenRecipe *tryToWiden(Instruction *I, VPWidenRecipe *LastWidenRecipe, - VFRange &Range); + /// The legality analysis. + LoopVectorizationLegality *Legal; - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it - /// is predicated. \return \p VPBB augmented with this new recipe if \p I is - /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new - /// Region. Update the packing decision of predicated instructions if they - /// feed \p I. Range.End may be decreased to ensure same recipe behavior from - /// \p Range.Start to \p Range.End. - VPBasicBlock *handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap &PredInst2Recipe); - - /// Create a replicating region for instruction \p I that requires - /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. - VPRegionBlock *createReplicateRegion(Instruction *I, - VPRecipeBase *PredRecipe); - - /// Build a VPlan according to the information gathered by Legal. \return a - /// VPlan for vectorization factors \p Range.Start and up to \p Range.End - /// exclusive, possibly decreasing \p Range.End. - VPlan *buildVPlan(VFRange &Range); + /// The profitablity analysis. + LoopVectorizationCostModel &CM; }; -} // namespace llvm - -namespace { - /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2671,15 +2672,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 : VF; + // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); + VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); } } } @@ -2717,7 +2718,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // vector form, we will construct the vector values on demand. if (VectorLoopValueMap.hasAnyScalarValue(V)) { - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); // If we've scalarized a value, that value should be an instruction. auto *I = cast(V); @@ -2734,8 +2735,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = cast( - VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); + auto *LastInst = + cast(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2752,15 +2753,14 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { Value *VectorValue = nullptr; if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(ScalarValue); - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from undef. - Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Undef); + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); for (unsigned Lane = 0; Lane < VF; ++Lane) - packScalarIntoVectorValue(V, {Part, Lane}); - VectorValue = VectorLoopValueMap.getVectorValue(V, Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, getOrCreateScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); } + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -2772,29 +2772,28 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { return B; } -Value * -InnerLoopVectorizer::getOrCreateScalarValue(Value *V, - const VPIteration &Instance) { +Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, + unsigned Lane) { + // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Instance.Lane > 0 - ? !Cost->isUniformAfterVectorization(cast(V), VF) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Instance)) - return VectorLoopValueMap.getScalarValue(V, Instance); + if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) + return VectorLoopValueMap.getScalarValue(V, Part, Lane); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Instance.Part); + auto *U = getOrCreateVectorValue(V, Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2803,20 +2802,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. - return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); -} - -void InnerLoopVectorizer::packScalarIntoVectorValue( - Value *V, const VPIteration &Instance) { - assert(V != Induction && "The new induction variable should not be used."); - assert(!V->getType()->isVectorTy() && "Can't pack a vector"); - assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - - Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); - Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, - Builder.getInt32(Instance.Lane)); - VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2889,7 +2875,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -3015,6 +3001,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = getMemInstAddressSpace(Instr); + // Scalarize the memory instruction if necessary. + if (Decision == LoopVectorizationCostModel::CM_Scalarize) + return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); + // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3029,7 +3019,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle consecutive loads/stores. if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); + Ptr = getOrCreateScalarValue(Ptr, 0, 0); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -3126,41 +3116,71 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - const VPIteration &Instance, bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + DEBUG(dbgs() << "LV: Scalarizing" + << (IfPredicateInstr ? " and predicating:" : ":") << *Instr + << '\n'); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector Params; setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); - Cloned->setOperand(op, NewOp); - } - addNewMetadata(Cloned, Instr); - - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); - - // Add the cloned scalar to the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); - - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); - - // End if-block. + VectorParts Cond; if (IfPredicateInstr) - PredicatedInstructions.push_back(Cloned); + Cond = createBlockInMask(Instr->getParent()); + + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; + + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + + // Start if-block. + Value *Cmp = nullptr; + if (IfPredicateInstr) { + Cmp = Cond[Part]; + if (!Cmp) // Block in mask is all-one. + Cmp = Builder.getTrue(); + else if (Cmp->getType()->isVectorTy()) + Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); + } + + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); + Cloned->setOperand(op, NewOp); + } + addNewMetadata(Cloned, Instr); + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // Add the cloned scalar to the scalar map entry. + VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); + + // If we just cloned a new assumption, add it the assumption cache. + if (auto *II = dyn_cast(Cloned)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); + + // End if-block. + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); + } + } } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3364,7 +3384,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3545,8 +3565,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); - - return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -3920,8 +3938,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() { IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(); - for (Instruction *PI : PredicatedInstructions) - sinkScalarOperands(&*PI); + predicateInstructions(); // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4376,6 +4393,126 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } +void InnerLoopVectorizer::predicateInstructions() { + + // For each instruction I marked for predication on value C, split I into its + // own basic block to form an if-then construct over C. Since I may be fed by + // an extractelement instruction or other scalar operand, we try to + // iteratively sink its scalar operands into the predicated block. If I feeds + // an insertelement instruction, we try to move this instruction into the + // predicated block as well. For non-void types, a phi node will be created + // for the resulting value (either vector or scalar). + // + // So for some predicated instruction, e.g. the conditional sdiv in: + // + // for.body: + // ... + // %add = add nsw i32 %mul, %0 + // %cmp5 = icmp sgt i32 %2, 7 + // br i1 %cmp5, label %if.then, label %if.end + // + // if.then: + // %div = sdiv i32 %0, %1 + // br label %if.end + // + // if.end: + // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] + // + // the sdiv at this point is scalarized and if-converted using a select. + // The inactive elements in the vector are not used, but the predicated + // instruction is still executed for all vector elements, essentially: + // + // vector.body: + // ... + // %17 = add nsw <2 x i32> %16, %wide.load + // %29 = extractelement <2 x i32> %wide.load, i32 0 + // %30 = extractelement <2 x i32> %wide.load51, i32 0 + // %31 = sdiv i32 %29, %30 + // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 1 + // %36 = extractelement <2 x i32> %wide.load51, i32 1 + // %37 = sdiv i32 %35, %36 + // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 + // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 + // + // Predication will now re-introduce the original control flow to avoid false + // side-effects by the sdiv instructions on the inactive elements, yielding + // (after cleanup): + // + // vector.body: + // ... + // %5 = add nsw <2 x i32> %4, %wide.load + // %8 = icmp sgt <2 x i32> %wide.load52, + // %9 = extractelement <2 x i1> %8, i32 0 + // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue + // + // pred.sdiv.if: + // %10 = extractelement <2 x i32> %wide.load, i32 0 + // %11 = extractelement <2 x i32> %wide.load51, i32 0 + // %12 = sdiv i32 %10, %11 + // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 + // br label %pred.sdiv.continue + // + // pred.sdiv.continue: + // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] + // %15 = extractelement <2 x i1> %8, i32 1 + // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 + // + // pred.sdiv.if54: + // %16 = extractelement <2 x i32> %wide.load, i32 1 + // %17 = extractelement <2 x i32> %wide.load51, i32 1 + // %18 = sdiv i32 %16, %17 + // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 + // br label %pred.sdiv.continue55 + // + // pred.sdiv.continue55: + // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] + // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 + + for (auto KV : PredicatedInstructions) { + BasicBlock::iterator I(KV.first); + BasicBlock *Head = I->getParent(); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT, LI); + I->moveBefore(T); + sinkScalarOperands(&*I); + + BasicBlock *PredicatedBlock = I->getParent(); + Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); + PredicatedBlock->setName(BBNamePrefix + ".if"); + PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); + + // If the instruction is non-void create a Phi node at reconvergence point. + if (!I->getType()->isVoidTy()) { + Value *IncomingTrue = nullptr; + Value *IncomingFalse = nullptr; + + if (I->hasOneUse() && isa(*I->user_begin())) { + // If the predicated instruction is feeding an insert-element, move it + // into the Then block; Phi node will be created for the vector. + InsertElementInst *IEI = cast(*I->user_begin()); + IEI->moveBefore(T); + IncomingTrue = IEI; // the new vector with the inserted element. + IncomingFalse = IEI->getOperand(0); // the unmodified vector + } else { + // Phi node will be created for the scalar predicated instruction. + IncomingTrue = &*I; + IncomingFalse = UndefValue::get(I->getType()); + } + + BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); + assert(PostDom && "Then block has multiple successors"); + PHINode *Phi = + PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); + IncomingTrue->replaceAllUsesWith(Phi); + Phi->addIncoming(IncomingFalse, Head); + Phi->addIncoming(IncomingTrue, I->getParent()); + } + } + + DEBUG(DT->verifyDomTree()); +} + InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); @@ -4519,7 +4656,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: case InductionDescriptor::IK_FpInduction: - llvm_unreachable("Integer/fp induction is handled elsewhere."); + return widenIntOrFpInduction(P); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4538,7 +4675,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); } } return; @@ -4564,11 +4701,25 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I) { +void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa(&I) || isa(&I) || isa(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + return; + } + switch (I.getOpcode()) { case Instruction::Br: - case Instruction::PHI: - llvm_unreachable("This instruction is handled by a different recipe."); + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + break; + case Instruction::PHI: { + // Vectorize PHINodes. + widenPHIInstruction(&I, UF, VF); + break; + } // End of PHI. case Instruction::GetElementPtr: { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4641,6 +4792,13 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); + break; + } + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -4688,7 +4846,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4747,6 +4905,16 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { auto *CI = dyn_cast(&I); setDebugLocFromInst(Builder, CI); + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + if (Cost->isOptimizableIVTruncate(CI, VF)) { + widenIntOrFpInduction(cast(CI->getOperand(0)), + cast(CI)); + break; + } + /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4777,7 +4945,11 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { + scalarizeInstruction(&I); + break; + } // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? @@ -4785,8 +4957,10 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(&I); + break; + } for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; @@ -4836,9 +5010,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } default: - // All other instructions are scalarized. - DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(&I); + break; } // end of switch. } @@ -4850,11 +5024,14 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); + DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), + LoopVectorPreHeader); DT->addNewBlock(LoopMiddleBlock, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + DEBUG(DT->verifyDomTree()); } @@ -6794,6 +6971,13 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. + collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + collectInstsToScalarize(VF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -7451,26 +7635,11 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); - buildVPlans(UserVF, UserVF); - DEBUG(printPlans(dbgs())); return {UserVF, 0}; } unsigned MaxVF = MaybeMaxVF.getValue(); assert(MaxVF != 0 && "MaxVF is zero."); - - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { - // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - if (VF > 1) - CM.collectInstsToScalarize(VF); - } - - buildVPlans(1, MaxVF); - DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7478,31 +7647,11 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { - DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); - BestVF = VF; - BestUF = UF; - - for (auto *VPlanIter = VPlans.begin(); VPlanIter != VPlans.end();) { - VPlan *Plan = *VPlanIter; - if (Plan->hasVF(VF)) - ++VPlanIter; - else { - VPlanIter = VPlans.erase(VPlanIter); - delete Plan; - } - } - assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); -} - -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, - DominatorTree *DT) { +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - VPTransformState State{ - BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(); //===------------------------------------------------===// // @@ -7513,9 +7662,36 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPlan *Plan = *VPlans.begin(); - Plan->execute(&State); + + // Move instructions to handle first-order recurrences. + DenseMap &SinkAfter = Legal->getSinkAfter(); + for (auto &Entry : SinkAfter) { + Entry.first->removeFromParent(); + Entry.first->insertAfter(Entry.second); + DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second + << " to vectorize a 1st order recurrence.\n"); + } + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7545,6 +7721,14 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( DeadInstructions.insert(IndUpdate); } } + +void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { + auto *SI = dyn_cast(Instr); + bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); + + return scalarizeInstruction(Instr, IfPredicateInstr); +} + Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } @@ -7600,725 +7784,6 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -namespace { -/// VPWidenRecipe is a recipe for producing a copy of vector type for each -/// Instruction in its ingredients independently, in order. This recipe covers -/// most of the traditional vectorization cases where each ingredient transforms -/// into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase { -private: - /// Hold the ingredients by pointing to their original BasicBlock location. - BasicBlock::iterator Begin; - BasicBlock::iterator End; - -public: - VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { - End = I->getIterator(); - Begin = End++; - } - - ~VPWidenRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; - } - - /// Produce widened copies of all Ingredients. - void execute(VPTransformState &State) override { - for (auto &Instr : make_range(Begin, End)) - State.ILV->widenInstruction(Instr); - } - - /// Augment the recipe to include Instr, if it lies at its End. - bool appendInstruction(Instruction *Instr) { - if (End != Instr->getIterator()) - return false; - End++; - return true; - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN\\l\""; - for (auto &Instr : make_range(Begin, End)) - O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; - } -}; - -/// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { -private: - PHINode *IV; - TruncInst *Trunc; - -public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} - - ~VPWidenIntOrFpInductionRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; - } - - /// Generate the vectorized and scalarized versions of the phi node as - /// needed by their users. - void execute(VPTransformState &State) override { - assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, Trunc); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN-INDUCTION"; - if (Trunc) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; - } else - O << " " << VPlanIngredient(IV) << "\\l\""; - } -}; - -/// A recipe for handling all phi nodes except for integer and FP inductions. -class VPWidenPHIRecipe : public VPRecipeBase { -private: - PHINode *Phi; - -public: - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} - - ~VPWidenPHIRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; - } - - /// Generate the phi/select nodes. - void execute(VPTransformState &State) override { - State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; - } -}; - -/// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. -class VPInterleaveRecipe : public VPRecipeBase { -private: - const InterleaveGroup *IG; - -public: - VPInterleaveRecipe(const InterleaveGroup *IG) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} - - ~VPInterleaveRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; - } - - /// Generate the wide load or store, and shuffles. - void execute(VPTransformState &State) override { - assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; - - const InterleaveGroup *getInterleaveGroup() { return IG; } -}; - -/// VPReplicateRecipe replicates a given instruction producing multiple scalar -/// copies of the original scalar type, one per lane, instead of producing a -/// single copy of widened type for all lanes. If the instruction is known to be -/// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase { -private: - /// The instruction being replicated. - Instruction *Ingredient; - - /// Indicator if only a single replica per lane is needed. - bool IsUniform; - - /// Indicator if the replicas are also predicated. - bool IsPredicated; - - /// Indicator if the scalar values should also be packed into a vector. - bool AlsoPack; - -public: - VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), - IsPredicated(IsPredicated) { - // Retain the previous behavior of predicateInstructions(), where an - // insert-element of a predicated instruction got hoisted into the - // predicated basic block iff it was its only user. This is achieved by - // having predicated instructions also pack their values into a vector by - // default unless they have a replicated user which uses their scalar value. - AlsoPack = IsPredicated && !I->use_empty(); - } - - ~VPReplicateRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; - } - - /// Generate replicas of the desired Ingredient. Replicas will be generated - /// for all parts and lanes unless a specific part and lane are specified in - /// the \p State. - void execute(VPTransformState &State) override; - - void setAlsoPack(bool Pack) { AlsoPack = Pack; } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") - << VPlanIngredient(Ingredient); - if (AlsoPack) - O << " (S->V)"; - O << "\\l\""; - } -}; - -/// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase { -private: - /// The input IR basic block used to obtain the mask providing the condition - /// bits for the branch. - BasicBlock *MaskedBasicBlock; - -public: - VPBranchOnMaskRecipe(BasicBlock *BB) - : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; - } - - /// Generate the extraction of the appropriate bit from the block mask and the - /// conditional branch. - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName() - << "\\l\""; - } -}; - -/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when -/// control converges back from a Branch-on-Mask. The phi nodes are needed in -/// order to merge values that are set under such a branch and feed their uses. -/// The phi nodes can be scalar or vector depending on the users of the value. -/// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase { -private: - Instruction *PredInst; - -public: - /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi - /// nodes after merging back from a Branch-on-Mask. - VPPredInstPHIRecipe(Instruction *PredInst) - : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {} - - ~VPPredInstPHIRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC; - } - - /// Generates phi nodes for live-outs as needed to retain SSA form. - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) - << "\\l\""; - } -}; -} // end anonymous namespace - -bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function &Predicate, VFRange &Range) { - assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(Range.Start); - - for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(TmpVF) != PredicateAtRangeStart) { - Range.End = TmpVF; - break; - } - - return PredicateAtRangeStart; -} - -/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, -/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range -/// of VF's starting at a given VF and extending it as much as possible. Each -/// vectorization decision can potentially shorten this sub-range during -/// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; - VPlan *Plan = buildVPlan(SubRange); - VPlans.push_back(Plan); - VF = SubRange.End; - } -} - -VPInterleaveRecipe * -LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. - auto isIGMember = [&](Instruction *I) -> std::function { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. - assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - return new VPInterleaveRecipe(IG); -} - -VPWidenIntOrFpInductionRecipe * -LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, - VFRange &Range) { - if (PHINode *Phi = dyn_cast(I)) { - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); - - return nullptr; - } - - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - - // Determine whether \p K is a truncation based on an induction variable that - // can be optimized. - auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; - }; - - if (isa(I) && - getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) - return new VPWidenIntOrFpInductionRecipe(cast(I->getOperand(0)), - cast(I)); - return nullptr; -} - -VPWidenRecipe *LoopVectorizationPlanner::tryToWiden( - Instruction *I, VPWidenRecipe *LastWidenRecipe, VFRange &Range) { - - if (Legal->isScalarWithPredication(I)) - return nullptr; - - static DenseSet VectorizableOpcodes = { - Instruction::Br, Instruction::PHI, Instruction::GetElementPtr, - Instruction::UDiv, Instruction::SDiv, Instruction::SRem, - Instruction::URem, Instruction::Add, Instruction::FAdd, - Instruction::Sub, Instruction::FSub, Instruction::Mul, - Instruction::FMul, Instruction::FDiv, Instruction::FRem, - Instruction::Shl, Instruction::LShr, Instruction::AShr, - Instruction::And, Instruction::Or, Instruction::Xor, - Instruction::Select, Instruction::ICmp, Instruction::FCmp, - Instruction::Store, Instruction::Load, Instruction::ZExt, - Instruction::SExt, Instruction::FPToUI, Instruction::FPToSI, - Instruction::FPExt, Instruction::PtrToInt, Instruction::IntToPtr, - Instruction::SIToFP, Instruction::UIToFP, Instruction::Trunc, - Instruction::FPTrunc, Instruction::BitCast, Instruction::Call}; - - if (!VectorizableOpcodes.count(I->getOpcode())) - return nullptr; - - if (CallInst *CI = dyn_cast(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) - return nullptr; - } - - auto willWiden = [&](unsigned VF) -> bool { - if (!isa(I) && (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF))) - return false; - if (CallInst *CI = dyn_cast(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); - bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - } - if (isa(I) || isa(I)) { - LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && - "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); - return Decision != LoopVectorizationCostModel::CM_Scalarize; - } - return true; - }; - - if (!getDecisionAndClampRange(willWiden, Range)) - return nullptr; - - // Success: widen this instruction. We optimize the common case where - // consecutive instructions can be represented by a single recipe. - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return LastWidenRecipe; - return new VPWidenRecipe(I); -} - -VPBasicBlock *LoopVectorizationPlanner::handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap &PredInst2Recipe) { - - bool IsUniform = getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, - Range); - - bool IsPredicated = Legal->isScalarWithPredication(I); - auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); - - // Find if I uses a predicated instruction. If so, it will use its scalar - // value. Avoid hoisting the insert-element which packs the scalar value into - // a vector value, as that happens iff all users use the vector value. - for (auto &Op : I->operands()) - if (auto *PredInst = dyn_cast(Op)) - if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) - PredInst2Recipe[PredInst]->setAlsoPack(false); - - // Finalize the recipe for Instr, first if it is not predicated. - if (!IsPredicated) { - DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); - VPBB->appendRecipe(Recipe); - return VPBB; - } - DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); - // Record predicated instructions for above packing optimizations. - PredInst2Recipe[I] = Recipe; - VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe)); - return cast(Region->setOneSuccessor(new VPBasicBlock())); -} - -VPRegionBlock * -LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe) { - // Instructions marked for predication are replicated and placed under an - // if-then construct to prevent side-effects. - - // Build the triangular if-then region. - std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); - assert(Instr->getParent() && "Predicated instruction not in any basic block"); - auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent()); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = - Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); - auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); - - // Note: first set Entry as region entry and then connect successors starting - // from it in order, to propagate the "parent" of each VPBasicBlock. - Entry->setTwoSuccessors(Pred, Exit); - Pred->setOneSuccessor(Exit); - - return Region; -} - -VPlan *LoopVectorizationPlanner::buildVPlan(VFRange &Range) { - - DenseMap &SinkAfter = Legal->getSinkAfter(); - DenseMap SinkAfterInverse; - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Hold a mapping from predicated instructions to their recipes, in order to - // fix their AlsoPack behavior if a user is determined to replicate and use a - // scalar instead of vector value. - DenseMap PredInst2Recipe; - - // Create a dummy pre-entry VPBasicBlock to start building the VPlan. - VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - VPlan *Plan = new VPlan(VPBB); - - // Scan the body of the loop in a topological order to visit each basic block - // after having visited its predecessor basic blocks. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - // Relevant instructions from basic block BB will be grouped into VPRecipe - // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBB->setOneSuccessor(FirstVPBBForBB); - VPBB = FirstVPBBForBB; - VPWidenRecipe *LastWidenRecipe = nullptr; - - std::vector Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. - for (Instruction &I : *BB) { - Instruction *Instr = &I; - - // First filter out irrelevant instructions, to ensure no recipes are - // built for them. - if (isa(Instr) || isa(Instr) || - DeadInstructions.count(Instr)) - continue; - - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) - continue; - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { - VPRecipeBase *Recipe = nullptr; - - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - if (PHINode *Phi = dyn_cast(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - continue; - } - - // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. - if ((Recipe = tryToWiden(Instr, LastWidenRecipe, Range))) { - if (Recipe != LastWidenRecipe) - VPBB->appendRecipe(Recipe); - LastWidenRecipe = cast(Recipe); - continue; - } - - // Otherwise, if all widening options failed, Instruction is to be - // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - handleReplication(Instr, Range, VPBB, PredInst2Recipe); - if (NextVPBB != VPBB) { - VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); - } - } - } - - // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks - // may also be empty, such as the last one VPBB, reflecting original - // basic-blocks with no recipes. - VPBasicBlock *PreEntry = cast(Plan->getEntry()); - assert(PreEntry->empty() && "Expecting empty pre-entry block."); - VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - PreEntry->disconnectSuccessor(Entry); - delete PreEntry; - - std::string PlanName; - raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; - Plan->addVF(VF); - RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { - Plan->addVF(VF); - RSO << "," << VF; - } - RSO << "},UF>=1"; - RSO.flush(); - Plan->setName(PlanName); - - return Plan; -} - -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; - IG->getInsertPos()->printAsOperand(O, false); - O << "\\l\""; - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) - O << " +\n" - << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; -} - -void VPReplicateRecipe::execute(VPTransformState &State) { - - if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); - if (AlsoPack) { // Insert scalar instance packing it into a vector. - // If we're constructing lane 0, initialize to start from undef. - if (State.Instance->Lane == 0) { - Value *Undef = - UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); - State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); - } - State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); - } - return; - } - - // Generate scalar instances for all VF lanes of all UF parts, unless the - // instruction is uniform inwhich case generate only the first lane for each - // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; - for (unsigned Part = 0; Part < State.UF; ++Part) - for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); - - // Broadcast or group together all instances into a vector. - if (AlsoPack) - for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->getOrCreateVectorValue(Ingredient, Part); -} - -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); - - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane; - - auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock); - - Value *ConditionBit = Cond[Part]; - if (!ConditionBit) // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - else if (ConditionBit->getType()->isVectorTy()) - ConditionBit = State.Builder.CreateExtractElement( - ConditionBit, State.Builder.getInt32(Lane)); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. - auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); - - DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe " - << MaskedBasicBlock->getName()); -} - -void VPPredInstPHIRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = cast( - State.ValueMap.getScalarValue(PredInst, *State.Instance)); - BasicBlock *PredicatedBB = ScalarPredInst->getParent(); - BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); - assert(PredicatingBB && "Predicated block has no single predecessor."); - - // By current pack/unpack logic we need to generate only a single phi node: if - // a vector value for the predicated instruction exists at this point it means - // the instruction has vector users only, and a phi for the vector value is - // needed. In this case the recipe of the predicated instruction is marked to - // also do that packing, thereby "hoisting" the insert-element sequence. - // Otherwise, a phi node for the scalar value is needed. - unsigned Part = State.Instance->Part; - if (State.ValueMap.hasVectorValue(PredInst, Part)) { - Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); - InsertElementInst *IEI = cast(VectorValue); - PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); - VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. - VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. - State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. - } else { - Type *PredInstType = PredInst->getType(); - PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); - Phi->addIncoming(ScalarPredInst, PredicatedBB); - State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); - } -} - bool LoopVectorizePass::processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -8437,7 +7902,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -8524,8 +7989,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -8533,7 +7996,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - LVP.executePlan(Unroller, DT); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -8543,7 +8006,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LVP.executePlan(LB, DT); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp deleted file mode 100644 index 498f4c4f7f31..000000000000 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ /dev/null @@ -1,401 +0,0 @@ -//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This is the LLVM vectorization plan. It represents a candidate for -/// vectorization, allowing to plan and optimize how to vectorize a given loop -/// before generating LLVM-IR. -/// The vectorizer uses vectorization plans to estimate the costs of potential -/// candidates and if profitable to execute the desired plan, generating vector -/// LLVM-IR code. -/// -//===----------------------------------------------------------------------===// - -#include "VPlan.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Support/GraphWriter.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" - -using namespace llvm; - -#define DEBUG_TYPE "vplan" - -/// \return the VPBasicBlock that is the entry of Block, possibly indirectly. -const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { - const VPBlockBase *Block = this; - while (const VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getEntry(); - return cast(Block); -} - -VPBasicBlock *VPBlockBase::getEntryBasicBlock() { - VPBlockBase *Block = this; - while (VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getEntry(); - return cast(Block); -} - -/// \return the VPBasicBlock that is the exit of Block, possibly indirectly. -const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { - const VPBlockBase *Block = this; - while (const VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getExit(); - return cast(Block); -} - -VPBasicBlock *VPBlockBase::getExitBasicBlock() { - VPBlockBase *Block = this; - while (VPRegionBlock *Region = dyn_cast(Block)) - Block = Region->getExit(); - return cast(Block); -} - -VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { - if (!Successors.empty() || !Parent) - return this; - assert(Parent->getExit() == this && - "Block w/o successors not the exit of its parent."); - return Parent->getEnclosingBlockWithSuccessors(); -} - -VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { - if (!Predecessors.empty() || !Parent) - return this; - assert(Parent->getEntry() == this && - "Block w/o predecessors not the entry of its parent."); - return Parent->getEnclosingBlockWithPredecessors(); -} - -void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - SmallVector Blocks; - for (VPBlockBase *Block : depth_first(Entry)) - Blocks.push_back(Block); - - for (VPBlockBase *Block : Blocks) - delete Block; -} - -BasicBlock * -VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { - // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. - // Pred stands for Predessor. Prev stands for Previous - last visited/created. - BasicBlock *PrevBB = CFG.PrevBB; - BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), - PrevBB->getParent(), CFG.LastBB); - DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); - - // Hook up the new basic block to its predecessors. - for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); - auto &PredVPSuccessors = PredVPBB->getSuccessors(); - BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; - assert(PredBB && "Predecessor basic-block not found building successor."); - auto *PredBBTerminator = PredBB->getTerminator(); - DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); - if (isa(PredBBTerminator)) { - assert(PredVPSuccessors.size() == 1 && - "Predecessor ending w/o branch must have single successor."); - PredBBTerminator->eraseFromParent(); - BranchInst::Create(NewBB, PredBB); - } else { - assert(PredVPSuccessors.size() == 2 && - "Predecessor ending with branch must have two successors."); - unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert(!PredBBTerminator->getSuccessor(idx) && - "Trying to reset an existing successor block."); - PredBBTerminator->setSuccessor(idx, NewBB); - } - } - return NewBB; -} - -void VPBasicBlock::execute(VPTransformState *State) { - bool Replica = State->Instance && - !(State->Instance->Part == 0 && State->Instance->Lane == 0); - VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; - VPBlockBase *SingleHPred = nullptr; - BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. - - // 1. Create an IR basic block, or reuse the last one if possible. - // The last IR basic block is reused, as an optimization, in three cases: - // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; - // B. when the current VPBB has a single (hierarchical) predecessor which - // is PrevVPBB and the latter has a single (hierarchical) successor; and - // C. when the current VPBB is an entry of a region replica - where PrevVPBB - // is the exit of this region from a previous instance, or the predecessor - // of this region. - if (PrevVPBB && /* A */ - !((SingleHPred = getSingleHierarchicalPredecessor()) && - SingleHPred->getExitBasicBlock() == PrevVPBB && - PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ - !(Replica && getPredecessors().empty())) { /* C */ - - NewBB = createEmptyBasicBlock(State->CFG); - State->Builder.SetInsertPoint(NewBB); - // Temporarily terminate with unreachable until CFG is rewired. - UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - State->Builder.SetInsertPoint(Terminator); - // Register NewBB in its loop. In innermost loops its the same for all BB's. - Loop *L = State->LI->getLoopFor(State->CFG.LastBB); - L->addBasicBlockToLoop(NewBB, *State->LI); - State->CFG.PrevBB = NewBB; - } - - // 2. Fill the IR basic block with IR instructions. - DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() - << " in BB:" << NewBB->getName() << '\n'); - - State->CFG.VPBB2IRBB[this] = NewBB; - State->CFG.PrevVPBB = this; - - for (VPRecipeBase &Recipe : Recipes) - Recipe.execute(*State); - - DEBUG(dbgs() << "LV: filled BB:" << *NewBB); -} - -void VPRegionBlock::execute(VPTransformState *State) { - ReversePostOrderTraversal RPOT(Entry); - - if (!isReplicator()) { - // Visit the VPBlocks connected to "this", starting from it. - for (VPBlockBase *Block : RPOT) { - DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } - return; - } - - assert(!State->Instance && "Replicating a Region with non-null instance."); - - // Enter replicating mode. - State->Instance = {0, 0}; - - for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { - State->Instance->Part = Part; - for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { - State->Instance->Lane = Lane; - // Visit the VPBlocks connected to \p this, starting from it. - for (VPBlockBase *Block : RPOT) { - DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } - } - } - - // Exit replicating mode. - State->Instance.reset(); -} - -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. -void VPlan::execute(VPTransformState *State) { - BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; - BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); - assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - BasicBlock *VectorLatchBB = VectorHeaderBB; - - // 1. Make room to generate basic-blocks inside loop body if needed. - VectorLatchBB = VectorHeaderBB->splitBasicBlock( - VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); - Loop *L = State->LI->getLoopFor(VectorHeaderBB); - L->addBasicBlockToLoop(VectorLatchBB, *State->LI); - // Remove the edge between Header and Latch to allow other connections. - // Temporarily terminate with unreachable until CFG is rewired. - // Note: this asserts the generated code's assumption that - // getFirstInsertionPt() can be dereferenced into an Instruction. - VectorHeaderBB->getTerminator()->eraseFromParent(); - State->Builder.SetInsertPoint(VectorHeaderBB); - UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - State->Builder.SetInsertPoint(Terminator); - - // 2. Generate code in loop body. - State->CFG.PrevVPBB = nullptr; - State->CFG.PrevBB = VectorHeaderBB; - State->CFG.LastBB = VectorLatchBB; - - for (VPBlockBase *Block : depth_first(Entry)) - Block->execute(State); - - // 3. Merge the temporary latch created with the last basic-block filled. - BasicBlock *LastBB = State->CFG.PrevBB; - // Connect LastBB to VectorLatchBB to facilitate their merge. - assert(isa(LastBB->getTerminator()) && - "Expected VPlan CFG to terminate with unreachable"); - LastBB->getTerminator()->eraseFromParent(); - BranchInst::Create(VectorLatchBB, LastBB); - - // Merge LastBB with Latch. - bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); - (void)Merged; - assert(Merged && "Could not merge last basic block with latch."); - VectorLatchBB = LastBB; - - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); -} - -void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { - BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); - assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); - // The vector body may be more than a single basic-block by this point. - // Update the dominator tree information inside the vector body by propagating - // it from header to latch, expecting only triangular control-flow, if any. - BasicBlock *PostDomSucc = nullptr; - for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { - // Get the list of successors of this block. - std::vector Succs(succ_begin(BB), succ_end(BB)); - assert(Succs.size() <= 2 && - "Basic block in vector loop has more than 2 successors."); - PostDomSucc = Succs[0]; - if (Succs.size() == 1) { - assert(PostDomSucc->getSinglePredecessor() && - "PostDom successor has more than one predecessor."); - DT->addNewBlock(PostDomSucc, BB); - continue; - } - BasicBlock *InterimSucc = Succs[1]; - if (PostDomSucc->getSingleSuccessor() == InterimSucc) { - PostDomSucc = Succs[1]; - InterimSucc = Succs[0]; - } - assert(InterimSucc->getSingleSuccessor() == PostDomSucc && - "One successor of a basic block does not lead to the other."); - assert(InterimSucc->getSinglePredecessor() && - "Interim successor has more than one predecessor."); - assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 && - "PostDom successor has more than two predecessors."); - DT->addNewBlock(InterimSucc, BB); - DT->addNewBlock(PostDomSucc, BB); - } -} - -const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { - return (isa(Block) ? "cluster_N" : "N") + - Twine(getOrCreateBID(Block)); -} - -const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) { - const std::string &Name = Block->getName(); - if (!Name.empty()) - return Name; - return "VPB" + Twine(getOrCreateBID(Block)); -} - -void VPlanPrinter::dump() { - Depth = 1; - bumpIndent(0); - OS << "digraph VPlan {\n"; - OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; - if (!Plan.getName().empty()) - OS << "\\n" << DOT::EscapeString(Plan.getName()); - OS << "\"]\n"; - OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; - OS << "edge [fontname=Courier, fontsize=30]\n"; - OS << "compound=true\n"; - - for (VPBlockBase *Block : depth_first(Plan.getEntry())) - dumpBlock(Block); - - OS << "}\n"; -} - -void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { - if (const VPBasicBlock *BasicBlock = dyn_cast(Block)) - dumpBasicBlock(BasicBlock); - else if (const VPRegionBlock *Region = dyn_cast(Block)) - dumpRegion(Region); - else - llvm_unreachable("Unsupported kind of VPBlock."); -} - -void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, - bool Hidden, const Twine &Label) { - // Due to "dot" we print an edge between two regions as an edge between the - // exit basic block and the entry basic of the respective regions. - const VPBlockBase *Tail = From->getExitBasicBlock(); - const VPBlockBase *Head = To->getEntryBasicBlock(); - OS << Indent << getUID(Tail) << " -> " << getUID(Head); - OS << " [ label=\"" << Label << '\"'; - if (Tail != From) - OS << " ltail=" << getUID(From); - if (Head != To) - OS << " lhead=" << getUID(To); - if (Hidden) - OS << "; splines=none"; - OS << "]\n"; -} - -void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { - auto &Successors = Block->getSuccessors(); - if (Successors.size() == 1) - drawEdge(Block, Successors.front(), false, ""); - else if (Successors.size() == 2) { - drawEdge(Block, Successors.front(), false, "T"); - drawEdge(Block, Successors.back(), false, "F"); - } else { - unsigned SuccessorNumber = 0; - for (auto *Successor : Successors) - drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); - } -} - -void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { - OS << Indent << getUID(BasicBlock) << " [label =\n"; - bumpIndent(1); - OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; - bumpIndent(1); - for (const VPRecipeBase &Recipe : *BasicBlock) - Recipe.print(OS, Indent); - bumpIndent(-2); - OS << "\n" << Indent << "]\n"; - dumpEdges(BasicBlock); -} - -void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { - OS << Indent << "subgraph " << getUID(Region) << " {\n"; - bumpIndent(1); - OS << Indent << "fontname=Courier\n" - << Indent << "label=\"" - << DOT::EscapeString(Region->isReplicator() ? " " : " ") - << DOT::EscapeString(Region->getName()) << "\"\n"; - // Dump the blocks of the region. - assert(Region->getEntry() && "Region contains no inner blocks."); - for (const VPBlockBase *Block : depth_first(Region->getEntry())) - dumpBlock(Block); - bumpIndent(-1); - OS << Indent << "}\n"; - dumpEdges(Region); -} - -void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { - std::string IngredientString; - raw_string_ostream RSO(IngredientString); - if (auto *Inst = dyn_cast(V)) { - if (!Inst->getType()->isVoidTy()) { - Inst->printAsOperand(RSO, false); - RSO << " = "; - } - RSO << Inst->getOpcodeName() << " "; - unsigned E = Inst->getNumOperands(); - if (E > 0) { - Inst->getOperand(0)->printAsOperand(RSO, false); - for (unsigned I = 1; I < E; ++I) - Inst->getOperand(I)->printAsOperand(RSO << ", ", false); - } - } else // !Inst - V->printAsOperand(RSO, false); - RSO.flush(); - O << DOT::EscapeString(IngredientString); -} diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h deleted file mode 100644 index 3c11fdeb0763..000000000000 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ /dev/null @@ -1,789 +0,0 @@ -//===- VPlan.h - Represent A Vectorizer Plan ------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the declarations of the Vectorization Plan base classes: -/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual -/// VPBlockBase, together implementing a Hierarchical CFG; -/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be -/// treated as proper graphs for generic algorithms; -/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained -/// within VPBasicBlocks; -/// 4. The VPlan class holding a candidate for vectorization; -/// 5. The VPlanPrinter class providing a way to print a plan in dot format. -/// These are documented in docs/VectorizationPlan.rst. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H - -#include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/ilist.h" -#include "llvm/ADT/ilist_node.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" - -// The (re)use of existing LoopVectorize classes is subject to future VPlan -// refactoring. -namespace { -// Forward declarations. -//class InnerLoopVectorizer; -class LoopVectorizationLegality; -class LoopVectorizationCostModel; -} // namespace - -namespace llvm { - -// Forward declarations. -class BasicBlock; -class InnerLoopVectorizer; -class VPBasicBlock; - -/// In what follows, the term "input IR" refers to code that is fed into the -/// vectorizer whereas the term "output IR" refers to code that is generated by -/// the vectorizer. - -/// VPIteration represents a single point in the iteration space of the output -/// (vectorized and/or unrolled) IR loop. -struct VPIteration { - unsigned Part; ///< in [0..UF) - unsigned Lane; ///< in [0..VF) -}; - -/// This is a helper struct for maintaining vectorization state. It's used for -/// mapping values from the original loop to their corresponding values in -/// the new loop. Two mappings are maintained: one for vectorized values and -/// one for scalarized values. Vectorized values are represented with UF -/// vector values in the new loop, and scalarized values are represented with -/// UF x VF scalar values in the new loop. UF and VF are the unroll and -/// vectorization factors, respectively. -/// -/// Entries can be added to either map with setVectorValue and setScalarValue, -/// which assert that an entry was not already added before. If an entry is to -/// replace an existing one, call resetVectorValue and resetScalarValue. This is -/// currently needed to modify the mapped values during "fix-up" operations that -/// occur once the first phase of widening is complete. These operations include -/// type truncation and the second phase of recurrence widening. -/// -/// Entries from either map can be retrieved using the getVectorValue and -/// getScalarValue functions, which assert that the desired value exists. - -struct VectorizerValueMap { -private: - /// The unroll factor. Each entry in the vector map contains UF vector values. - unsigned UF; - - /// The vectorization factor. Each entry in the scalar map contains UF x VF - /// scalar values. - unsigned VF; - - /// The vector and scalar map storage. We use std::map and not DenseMap - /// because insertions to DenseMap invalidate its iterators. - typedef SmallVector VectorParts; - typedef SmallVector, 2> ScalarParts; - std::map VectorMapStorage; - std::map ScalarMapStorage; - -public: - /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} - - /// \return True if the map has any vector entry for \p Key. - bool hasAnyVectorValue(Value *Key) const { - return VectorMapStorage.count(Key); - } - - /// \return True if the map has a vector entry for \p Key and \p Part. - bool hasVectorValue(Value *Key, unsigned Part) const { - assert(Part < UF && "Queried Vector Part is too large."); - if (!hasAnyVectorValue(Key)) - return false; - const VectorParts &Entry = VectorMapStorage.find(Key)->second; - assert(Entry.size() == UF && "VectorParts has wrong dimensions."); - return Entry[Part] != nullptr; - } - - /// \return True if the map has any scalar entry for \p Key. - bool hasAnyScalarValue(Value *Key) const { - return ScalarMapStorage.count(Key); - } - - /// \return True if the map has a scalar entry for \p Key and \p Instance. - bool hasScalarValue(Value *Key, const VPIteration &Instance) const { - assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); - if (!hasAnyScalarValue(Key)) - return false; - const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; - assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF && - "ScalarParts has wrong dimensions."); - return Entry[Instance.Part][Instance.Lane] != nullptr; - } - - /// Retrieve the existing vector value that corresponds to \p Key and - /// \p Part. - Value *getVectorValue(Value *Key, unsigned Part) { - assert(hasVectorValue(Key, Part) && "Getting non-existent value."); - return VectorMapStorage[Key][Part]; - } - - /// Retrieve the existing scalar value that corresponds to \p Key and - /// \p Instance. - Value *getScalarValue(Value *Key, const VPIteration &Instance) { - assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); - return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; - } - - /// Set a vector value associated with \p Key and \p Part. Assumes such a - /// value is not already set. If it is, use resetVectorValue() instead. - void setVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); - if (!VectorMapStorage.count(Key)) { - VectorParts Entry(UF); - VectorMapStorage[Key] = Entry; - } - VectorMapStorage[Key][Part] = Vector; - } - - /// Set a scalar value associated with \p Key and \p Instance. Assumes such a - /// value is not already set. - void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) { - assert(!hasScalarValue(Key, Instance) && "Scalar value already set"); - if (!ScalarMapStorage.count(Key)) { - ScalarParts Entry(UF); - // TODO: Consider storing uniform values only per-part, as they occupy - // lane 0 only, keeping the other VF-1 redundant entries null. - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); - ScalarMapStorage[Key] = Entry; - } - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; - } - - /// Reset the vector value associated with \p Key for the given \p Part. - /// This function can be used to update values that have already been - /// vectorized. This is the case for "fix-up" operations including type - /// truncation and the second phase of recurrence vectorization. - void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(hasVectorValue(Key, Part) && "Vector value not set for part"); - VectorMapStorage[Key][Part] = Vector; - } - - /// Reset the scalar value associated with \p Key for \p Part and \p Lane. - /// This function can be used to update values that have already been - /// scalarized. This is the case for "fix-up" operations including scalar phi - /// nodes for scalarized and predicated instructions. - void resetScalarValue(Value *Key, const VPIteration &Instance, - Value *Scalar) { - assert(hasScalarValue(Key, Instance) && - "Scalar value not set for part and lane"); - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; - } -}; - -/// VPTransformState holds information passed down when "executing" a VPlan, -/// needed for generating the output IR. -struct VPTransformState { - - VPTransformState(unsigned VF, unsigned UF, class LoopInfo *LI, - class DominatorTree *DT, IRBuilder<> &Builder, - VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV) - : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), - ValueMap(ValueMap), ILV(ILV) {} - - /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - unsigned VF; - unsigned UF; - - /// Hold the indices to generate specific scalar instructions. Null indicates - /// that all instances are to be generated, using either scalar or vector - /// instructions. - Optional Instance; - - /// Hold state information used when constructing the CFG of the output IR, - /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. - struct CFGState { - /// The previous VPBasicBlock visited. Initially set to null. - VPBasicBlock *PrevVPBB; - /// The previous IR BasicBlock created or used. Initially set to the new - /// header BasicBlock. - BasicBlock *PrevBB; - /// The last IR BasicBlock in the output IR. Set to the new latch - /// BasicBlock, used for placing the newly created BasicBlocks. - BasicBlock *LastBB; - /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case - /// of replication, maps the BasicBlock of the last replica created. - SmallDenseMap VPBB2IRBB; - - CFGState() : PrevVPBB(nullptr), PrevBB(nullptr), LastBB(nullptr) {} - } CFG; - - /// Hold a pointer to LoopInfo to register new basic blocks in the loop. - class LoopInfo *LI; - - /// Hold a pointer to Dominator Tree to register new basic blocks in the loop. - class DominatorTree *DT; - - /// Hold a reference to the IRBuilder used to generate output IR code. - IRBuilder<> &Builder; - - /// Hold a reference to the Value state information used when generating the - /// Values of the output IR. - VectorizerValueMap &ValueMap; - - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. - class InnerLoopVectorizer *ILV; -}; - -/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. -/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. -class VPBlockBase { -private: - const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). - - /// An optional name for the block. - std::string Name; - - /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if - /// it is a topmost VPBlockBase. - class VPRegionBlock *Parent; - - /// List of predecessor blocks. - SmallVector Predecessors; - - /// List of successor blocks. - SmallVector Successors; - - /// Add \p Successor as the last successor to this block. - void appendSuccessor(VPBlockBase *Successor) { - assert(Successor && "Cannot add nullptr successor!"); - Successors.push_back(Successor); - } - - /// Add \p Predecessor as the last predecessor to this block. - void appendPredecessor(VPBlockBase *Predecessor) { - assert(Predecessor && "Cannot add nullptr predecessor!"); - Predecessors.push_back(Predecessor); - } - - /// Remove \p Predecessor from the predecessors of this block. - void removePredecessor(VPBlockBase *Predecessor) { - auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor); - assert(Pos && "Predecessor does not exist"); - Predecessors.erase(Pos); - } - - /// Remove \p Successor from the successors of this block. - void removeSuccessor(VPBlockBase *Successor) { - auto Pos = std::find(Successors.begin(), Successors.end(), Successor); - assert(Pos && "Successor does not exist"); - Successors.erase(Pos); - } - -protected: - VPBlockBase(const unsigned char SC, const std::string &N) - : SubclassID(SC), Name(N), Parent(nullptr) {} - -public: - /// An enumeration for keeping track of the concrete subclass of VPBlockBase - /// that are actually instantiated. Values of this enumeration are kept in the - /// SubclassID field of the VPBlockBase objects. They are used for concrete - /// type identification. - typedef enum { VPBasicBlockSC, VPRegionBlockSC } VPBlockTy; - - typedef SmallVectorImpl VPBlocksTy; - - virtual ~VPBlockBase() {} - - const std::string &getName() const { return Name; } - - void setName(const Twine &newName) { Name = newName.str(); } - - /// \return an ID for the concrete type of this object. - /// This is used to implement the classof checks. This should not be used - /// for any other purpose, as the values may change as LLVM evolves. - unsigned getVPBlockID() const { return SubclassID; } - - const VPRegionBlock *getParent() const { return Parent; } - - void setParent(VPRegionBlock *P) { Parent = P; } - - /// \return the VPBasicBlock that is the entry of this VPBlockBase, - /// recursively, if the latter is a VPRegionBlock. Otherwise, if this - /// VPBlockBase is a VPBasicBlock, it is returned. - const VPBasicBlock *getEntryBasicBlock() const; - VPBasicBlock *getEntryBasicBlock(); - - /// \return the VPBasicBlock that is the exit of this VPBlockBase, - /// recursively, if the latter is a VPRegionBlock. Otherwise, if this - /// VPBlockBase is a VPBasicBlock, it is returned. - const VPBasicBlock *getExitBasicBlock() const; - VPBasicBlock *getExitBasicBlock(); - - const VPBlocksTy &getSuccessors() const { return Successors; } - VPBlocksTy &getSuccessors() { return Successors; } - - const VPBlocksTy &getPredecessors() const { return Predecessors; } - VPBlocksTy &getPredecessors() { return Predecessors; } - - /// \return the successor of this VPBlockBase if it has a single successor. - /// Otherwise return a null pointer. - VPBlockBase *getSingleSuccessor() const { - return (Successors.size() == 1 ? *Successors.begin() : nullptr); - } - - /// \return the predecessor of this VPBlockBase if it has a single - /// predecessor. Otherwise return a null pointer. - VPBlockBase *getSinglePredecessor() const { - return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr); - } - - /// An Enclosing Block of a block B is any block containing B, including B - /// itself. \return the closest enclosing block starting from "this", which - /// has successors. \return the root enclosing block if all enclosing blocks - /// have no successors. - VPBlockBase *getEnclosingBlockWithSuccessors(); - - /// \return the closest enclosing block starting from "this", which has - /// predecessors. \return the root enclosing block if all enclosing blocks - /// have no predecessors. - VPBlockBase *getEnclosingBlockWithPredecessors(); - - /// \return the successors either attached directly to this VPBlockBase or, if - /// this VPBlockBase is the exit block of a VPRegionBlock and has no - /// successors of its own, search recursively for the first enclosing - /// VPRegionBlock that has successors and return them. If no such - /// VPRegionBlock exists, return the (empty) successors of the topmost - /// VPBlockBase reached. - const VPBlocksTy &getHierarchicalSuccessors() { - return getEnclosingBlockWithSuccessors()->getSuccessors(); - } - - /// \return the hierarchical successor of this VPBlockBase if it has a single - /// hierarchical successor. Otherwise return a null pointer. - VPBlockBase *getSingleHierarchicalSuccessor() { - return getEnclosingBlockWithSuccessors()->getSingleSuccessor(); - } - - /// \return the predecessors either attached directly to this VPBlockBase or, - /// if this VPBlockBase is the entry block of a VPRegionBlock and has no - /// predecessors of its own, search recursively for the first enclosing - /// VPRegionBlock that has predecessors and return them. If no such - /// VPRegionBlock exists, return the (empty) predecessors of the topmost - /// VPBlockBase reached. - const VPBlocksTy &getHierarchicalPredecessors() { - return getEnclosingBlockWithPredecessors()->getPredecessors(); - } - - /// \return the hierarchical predecessor of this VPBlockBase if it has a - /// single hierarchical predecessor. Otherwise return a null pointer. - VPBlockBase *getSingleHierarchicalPredecessor() { - return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); - } - - /// Sets a given VPBlockBase \p Successor as the single successor and \return - /// \p Successor. The parent of this Block is copied to be the parent of - /// \p Successor. - VPBlockBase *setOneSuccessor(VPBlockBase *Successor) { - assert(Successors.empty() && "Setting one successor when others exist."); - appendSuccessor(Successor); - Successor->appendPredecessor(this); - Successor->Parent = Parent; - return Successor; - } - - /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two - /// successors. The parent of this Block is copied to be the parent of both - /// \p IfTrue and \p IfFalse. - void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) { - assert(Successors.empty() && "Setting two successors when others exist."); - appendSuccessor(IfTrue); - appendSuccessor(IfFalse); - IfTrue->appendPredecessor(this); - IfFalse->appendPredecessor(this); - IfTrue->Parent = Parent; - IfFalse->Parent = Parent; - } - - void disconnectSuccessor(VPBlockBase *Successor) { - assert(Successor && "Successor to disconnect is null."); - removeSuccessor(Successor); - Successor->removePredecessor(this); - } - - /// The method which generates the output IR that correspond to this - /// VPBlockBase, thereby "executing" the VPlan. - virtual void execute(struct VPTransformState *State) = 0; - - /// Delete all blocks reachable from a given VPBlockBase, inclusive. - static void deleteCFG(VPBlockBase *Entry); -}; - -/// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. -class VPRecipeBase : public ilist_node_with_parent { - friend VPBasicBlock; - -private: - const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). - - /// Each VPRecipe belongs to a single VPBasicBlock. - VPBasicBlock *Parent; - -public: - /// An enumeration for keeping track of the concrete subclass of VPRecipeBase - /// that is actually instantiated. Values of this enumeration are kept in the - /// SubclassID field of the VPRecipeBase objects. They are used for concrete - /// type identification. - typedef enum { - VPBranchOnMaskSC, - VPInterleaveSC, - VPPredInstPHISC, - VPReplicateSC, - VPWidenIntOrFpInductionSC, - VPWidenPHISC, - VPWidenSC, - } VPRecipeTy; - - VPRecipeBase(const unsigned char SC) : SubclassID(SC), Parent(nullptr) {} - - virtual ~VPRecipeBase() {} - - /// \return an ID for the concrete type of this object. - /// This is used to implement the classof checks. This should not be used - /// for any other purpose, as the values may change as LLVM evolves. - unsigned getVPRecipeID() const { return SubclassID; } - - /// \return the VPBasicBlock which this VPRecipe belongs to. - VPBasicBlock *getParent() { return Parent; } - const VPBasicBlock *getParent() const { return Parent; } - - /// The method which generates the output IR instructions that correspond to - /// this VPRecipe, thereby "executing" the VPlan. - virtual void execute(struct VPTransformState &State) = 0; - - /// Each recipe prints itself. - virtual void print(raw_ostream &O, const Twine &Indent) const = 0; -}; - -/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It -/// holds a sequence of zero or more VPRecipe's each representing a sequence of -/// output IR instructions. -class VPBasicBlock : public VPBlockBase { -public: - typedef iplist RecipeListTy; - -private: - /// The VPRecipes held in the order of output instructions to generate. - RecipeListTy Recipes; - -public: - /// Instruction iterators... - typedef RecipeListTy::iterator iterator; - typedef RecipeListTy::const_iterator const_iterator; - typedef RecipeListTy::reverse_iterator reverse_iterator; - typedef RecipeListTy::const_reverse_iterator const_reverse_iterator; - - //===--------------------------------------------------------------------===// - /// Recipe iterator methods - /// - inline iterator begin() { return Recipes.begin(); } - inline const_iterator begin() const { return Recipes.begin(); } - inline iterator end() { return Recipes.end(); } - inline const_iterator end() const { return Recipes.end(); } - - inline reverse_iterator rbegin() { return Recipes.rbegin(); } - inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); } - inline reverse_iterator rend() { return Recipes.rend(); } - inline const_reverse_iterator rend() const { return Recipes.rend(); } - - inline size_t size() const { return Recipes.size(); } - inline bool empty() const { return Recipes.empty(); } - inline const VPRecipeBase &front() const { return Recipes.front(); } - inline VPRecipeBase &front() { return Recipes.front(); } - inline const VPRecipeBase &back() const { return Recipes.back(); } - inline VPRecipeBase &back() { return Recipes.back(); } - - /// \brief Returns a pointer to a member of the recipe list. - static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) { - return &VPBasicBlock::Recipes; - } - - VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) - : VPBlockBase(VPBasicBlockSC, Name.str()) { - if (Recipe) - appendRecipe(Recipe); - } - - ~VPBasicBlock() { Recipes.clear(); } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPBlockBase *V) { - return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC; - } - - /// Augment the existing recipes of a VPBasicBlock with an additional - /// \p Recipe as the last recipe. - void appendRecipe(VPRecipeBase *Recipe) { - assert(Recipe && "No recipe to append."); - assert(!Recipe->Parent && "Recipe already in VPlan"); - Recipe->Parent = this; - return Recipes.push_back(Recipe); - } - - /// The method which generates the output IR instructions that correspond to - /// this VPBasicBlock, thereby "executing" the VPlan. - void execute(struct VPTransformState *State) override; - -private: - /// Create an IR BasicBlock to hold the output instructions generated by this - /// VPBasicBlock, and return it. Update the CFGState accordingly. - BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG); -}; - -/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks -/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG. -/// A VPRegionBlock may indicate that its contents are to be replicated several -/// times. This is designed to support predicated scalarization, in which a -/// scalar if-then code structure needs to be generated VF * UF times. Having -/// this replication indicator helps to keep a single model for multiple -/// candidate VF's. The actual replication takes place only once the desired VF -/// and UF have been determined. -class VPRegionBlock : public VPBlockBase { -private: - /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. - VPBlockBase *Entry; - - /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock. - VPBlockBase *Exit; - - /// An indicator whether this region is to generate multiple replicated - /// instances of output IR corresponding to its VPBlockBases. - bool IsReplicator; - -public: - VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit, - const std::string &Name = "", bool IsReplicator = false) - : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit), - IsReplicator(IsReplicator) { - assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); - assert(Exit->getSuccessors().empty() && "Exit block has successors."); - Entry->setParent(this); - Exit->setParent(this); - } - - ~VPRegionBlock() { - if (Entry) - deleteCFG(Entry); - } - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPBlockBase *V) { - return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC; - } - - const VPBlockBase *getEntry() const { return Entry; } - VPBlockBase *getEntry() { return Entry; } - - const VPBlockBase *getExit() const { return Exit; } - VPBlockBase *getExit() { return Exit; } - - /// An indicator whether this region is to generate multiple replicated - /// instances of output IR corresponding to its VPBlockBases. - bool isReplicator() const { return IsReplicator; } - - /// The method which generates the output IR instructions that correspond to - /// this VPRegionBlock, thereby "executing" the VPlan. - void execute(struct VPTransformState *State) override; -}; - -/// VPlan models a candidate for vectorization, encoding various decisions take -/// to produce efficient output IR, including which branches, basic-blocks and -/// output IR instructions to generate, and their cost. VPlan holds a -/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry -/// VPBlock. -class VPlan { -private: - /// Hold the single entry to the Hierarchical CFG of the VPlan. - VPBlockBase *Entry; - - /// Holds the VFs applicable to this VPlan. - SmallSet VFs; - - /// Holds the name of the VPlan, for printing. - std::string Name; - -public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {} - - ~VPlan() { - if (Entry) - VPBlockBase::deleteCFG(Entry); - } - - /// Generate the IR code for this VPlan. - void execute(struct VPTransformState *State); - - VPBlockBase *getEntry() { return Entry; } - const VPBlockBase *getEntry() const { return Entry; } - - VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; } - - void addVF(unsigned VF) { VFs.insert(VF); } - - bool hasVF(unsigned VF) { return VFs.count(VF); } - - const std::string &getName() const { return Name; } - - void setName(const Twine &newName) { Name = newName.str(); } - -private: - /// Add to the given dominator tree the header block and every new basic block - /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(class DominatorTree *DT, - BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); -}; - -/// VPlanPrinter prints a given VPlan to a given output stream. The printing is -/// indented and follows the dot format. -class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); - friend inline raw_ostream &operator<<(raw_ostream &OS, - const struct VPlanIngredient &I); - -private: - raw_ostream &OS; - VPlan &Plan; - unsigned Depth; - unsigned TabWidth = 2; - std::string Indent; - - unsigned BID = 0; - - SmallDenseMap BlockID; - - /// Handle indentation. - void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } - - /// Print a given \p Block of the Plan. - void dumpBlock(const VPBlockBase *Block); - - /// Print the information related to the CFG edges going out of a given - /// \p Block, followed by printing the successor blocks themselves. - void dumpEdges(const VPBlockBase *Block); - - /// Print a given \p BasicBlock, including its VPRecipes, followed by printing - /// its successor blocks. - void dumpBasicBlock(const VPBasicBlock *BasicBlock); - - /// Print a given \p Region of the Plan. - void dumpRegion(const VPRegionBlock *Region); - - unsigned getOrCreateBID(const VPBlockBase *Block) { - return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; - } - - const Twine getOrCreateName(const VPBlockBase *Block); - - const Twine getUID(const VPBlockBase *Block); - - /// Print the information related to a CFG edge between two VPBlockBases. - void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, - const Twine &Label); - - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} - - void dump(); - - static void printAsIngredient(raw_ostream &O, Value *V); -}; - -struct VPlanIngredient { - Value *V; - VPlanIngredient(Value *V) : V(V) {} -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { - VPlanPrinter::printAsIngredient(OS, I.V); - return OS; -} - -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { - VPlanPrinter Printer(OS, Plan); - Printer.dump(); - return OS; -} - -//===--------------------------------------------------------------------===// -// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs // -//===--------------------------------------------------------------------===// - -// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a -// graph of VPBlockBase nodes... - -template <> struct GraphTraits { - typedef VPBlockBase *NodeRef; - typedef SmallVectorImpl::iterator ChildIteratorType; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits { - typedef const VPBlockBase *NodeRef; - typedef SmallVectorImpl::const_iterator ChildIteratorType; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a -// graph of VPBlockBase nodes... and to walk it in inverse order. Inverse order -// for a VPBlockBase is considered to be when traversing the predecessors of a -// VPBlockBase instead of its successors. -// - -template <> struct GraphTraits> { - typedef VPBlockBase *NodeRef; - typedef SmallVectorImpl::iterator ChildIteratorType; - - static Inverse getEntryNode(Inverse B) { - return B; - } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -} // namespace llvm - -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index eb12803a3447..37a6d4e79984 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -26,9 +26,9 @@ target triple = "aarch64--linux-gnu" ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] ; CHECK: [[PRED_UDIV_IF]]: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] ; CHECK: [[PRED_UDIV_CONTINUE]]: @@ -37,9 +37,9 @@ target triple = "aarch64--linux-gnu" ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_CONTINUE2]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll index b0ebb4edf2ad..3be086587412 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -18,8 +18,8 @@ target triple = "aarch64--linux-gnu" ; Cost of udiv: ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -59,8 +59,8 @@ for.end: ; Cost of store: ; (store(4) + extractelement(3)) / 2 = 3 ; -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -98,10 +98,10 @@ for.end: ; Cost of udiv: ; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 ; -; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -143,10 +143,10 @@ for.end: ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -192,16 +192,16 @@ for.end: ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x -; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x -; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x +; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x +; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 ; define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll index 9fdf22ecd926..e7096c29b994 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -24,10 +24,10 @@ for.body: for.end: ret void -; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 - ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 } diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index bc9247f80717..3a8237ff84f7 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -467,6 +467,13 @@ for.body: ; SINK-AFTER: %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32> ; SINK-AFTER: %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32> ; SINK-AFTER: mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]] +; Check also that the sext sank after the load in the scalar loop. +; SINK-AFTER: for.body +; SINK-AFTER: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ %[[LOAD:.+]], %for.body ] +; SINK-AFTER: %[[LOAD]] = load i16, i16* %arrayidx2 +; SINK-AFTER: %[[CONV:.+]] = sext i16 %scalar.recur to i32 +; SINK-AFTER: %[[CONV3:.+]] = sext i16 %[[LOAD]] to i32 +; SINK-AFTER: %mul = mul nsw i32 %[[CONV3]], %[[CONV]] ; define void @sink_after(i16* %a, i32* %b, i64 %n) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index c654b7994221..9765fdea9f02 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -209,9 +209,9 @@ entry: ; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]] ; CHECK: [[IF0]]: ; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T01:.+]] = add nsw i32 %[[T00]], %x -; CHECK: %[[T02:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T03:.+]] = udiv i32 %[[T02]], %[[T01]] +; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0 +; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x +; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]] ; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0 ; CHECK: br label %[[CONT0]] ; CHECK: [[CONT0]]: @@ -219,9 +219,9 @@ entry: ; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]] ; CHECK: [[IF1]]: ; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T07:.+]] = add nsw i32 %[[T06]], %x -; CHECK: %[[T08:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T09:.+]] = udiv i32 %[[T08]], %[[T07]] +; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1 +; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x +; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]] ; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1 ; CHECK: br label %[[CONT1]] ; CHECK: [[CONT1]]: