From f1dfbfdde1245b1f045bf57e3090782fd37db475 Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Sat, 21 Sep 2013 00:06:20 +0000 Subject: [PATCH] Revert "SLPVectorizer: Handle more horizontal reductions (disabled)" This reverts commit r191108. The horizontal.ll test case fails under libgmalloc. Thanks Shuxin for pointing this out to me. llvm-svn: 191121 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 376 +--------------- .../SLPVectorizer/X86/horizontal.ll | 415 ------------------ 2 files changed, 8 insertions(+), 783 deletions(-) delete mode 100644 llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index caedd0986bf2..cd3f723cd3e4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -49,11 +49,6 @@ static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number ")); - -static cl::opt -ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, - cl::desc("Attempt to vectorize horizontal reductions")); - namespace { static const unsigned MinVecRegSize = 128; @@ -243,21 +238,17 @@ public: } /// \brief Vectorize the tree that starts with the elements in \p VL. - /// Returns the vectorized root and the scalar operations the root was based - /// on. - std::pair vectorizeTree(); + void vectorizeTree(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef Roots); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -410,9 +401,6 @@ private: /// Numbers instructions in different blocks. DenseMap BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; - // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -425,9 +413,8 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef Roots) { deleteTree(); - RdxOps = Rdx; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -458,12 +445,8 @@ void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); continue; } - Instruction *UserInst = dyn_cast(*User); - if (!UserInst) - continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + if (!isa(*User)) continue; DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << @@ -595,10 +578,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(User)) - continue; - // Make sure that we can schedule this unknown user. BlockNumbering &BN = BlocksNumbers[BB]; int UserIndex = BN.getIndex(User); @@ -1393,7 +1372,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return 0; } -std::pair BoUpSLP::vectorizeTree() { +void BoUpSLP::vectorizeTree() { Builder.SetInsertPoint(F->getEntryBlock().begin()); vectorizeTree(&VectorizableTree[0]); @@ -1470,10 +1449,7 @@ std::pair BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); assert(!MustGather.count(*User) && "Replacing gathered value with undef"); - - assert((ScalarToTreeEntry.count(*User) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(*User))) && + assert(ScalarToTreeEntry.count(*User) && "Replacing out-of-tree value with undef"); } Value *Undef = UndefValue::get(Ty); @@ -1488,9 +1464,6 @@ std::pair BoUpSLP::vectorizeTree() { BlocksNumbers[it].forget(); } Builder.ClearInsertionPoint(); - - return std::make_pair(VectorizableTree[0].VectorizedValue, - &VectorizableTree[0].Scalars); } void BoUpSLP::optimizeGatherSequence() { @@ -1914,310 +1887,6 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { return 0; } -/// \brief Generate a shuffle mask to be used in a reduction tree. -/// -/// \param VecLen The length of the vector to be reduced. -/// \param NumEltsToRdx The number of elements that should be reduced in the -/// vector. -/// \param IsPairwise Whether the reduction is a pairwise or splitting -/// reduction. A pairwise reduction will generate a mask of -/// <0,2,...> or <1,3,..> while a splitting reduction will generate -/// <2,3, undef,undef> for a vector of 4 and NumElts = 2. -/// \param IsLeft True will generate a mask of even elements, odd otherwise. -static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, - bool IsPairwise, bool IsLeft, - IRBuilder<> &Builder) { - assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); - - SmallVector ShuffleMask( - VecLen, UndefValue::get(Builder.getInt32Ty())); - - if (IsPairwise) - // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). - for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); - else - // Move the upper half of the vector to the lower half. - for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); - - return ConstantVector::get(ShuffleMask); -} - - -/// Model horizontal reductions. -/// -/// A horizontal reduction is a tree of reduction operations (currently add and -/// fadd) that has operations that can be put into a vector as its leaf. -/// For example, this tree: -/// -/// mul mul mul mul -/// \ / \ / -/// + + -/// \ / -/// + -/// This tree has "mul" as its reduced values and "+" as its reduction -/// operations. A reduction might be feeding into a store or a binary operation -/// feeding a phi. -/// ... -/// \ / -/// + -/// \ -/// phi += -/// -/// Or: -/// ... -/// \ / -/// + -/// \ -/// *p = -/// -class HorizontalReduction { - SmallPtrSet ReductionOps; - SmallVector ReducedVals; - - BinaryOperator *ReductionRoot; - PHINode *ReductionPHI; - - /// The opcode of the reduction. - unsigned ReductionOpcode; - /// The opcode of the values we perform a reduction on. - unsigned ReducedValueOpcode; - /// The width of one full horizontal reduction operation. - unsigned ReduxWidth; - /// Should we model this reduction as a pairwise reduction tree or a tree that - /// splits the vector in halves and adds those halves. - bool IsPairwiseReduction; - -public: - HorizontalReduction() - : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0), - ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} - - /// \brief Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - DataLayout *DL) { - assert((!Phi || - std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && - "Thi phi needs to use the binary operator"); - - // We could have a initial reductions that is not an add. - // r *= v1 + v2 + v3 + v4 - // In such a case start looking for a tree rooted in the first '+'. - if (Phi) { - if (B->getOperand(0) == Phi) { - Phi = 0; - B = dyn_cast(B->getOperand(1)); - } else if (B->getOperand(1) == Phi) { - Phi = 0; - B = dyn_cast(B->getOperand(0)); - } - } - - if (!B) - return false; - - Type *Ty = B->getType(); - if (Ty->isVectorTy()) - return false; - - ReductionOpcode = B->getOpcode(); - ReducedValueOpcode = 0; - ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); - ReductionRoot = B; - ReductionPHI = Phi; - - if (ReduxWidth < 4) - return false; - - // We currently only support adds. - if (ReductionOpcode != Instruction::Add && - ReductionOpcode != Instruction::FAdd) - return false; - - // Post order traverse the reduction tree starting at B. We only handle true - // trees containing only binary operators. - SmallVector, 32> Stack; - Stack.push_back(std::make_pair(B, 0)); - while (!Stack.empty()) { - BinaryOperator *TreeN = Stack.back().first; - unsigned EdgeToVist = Stack.back().second++; - bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; - - // Only handle trees in the current basic block. - if (TreeN->getParent() != B->getParent()) - return false; - - // Each tree node needs to have one user except for the ultimate - // reduction. - if (!TreeN->hasOneUse() && TreeN != B) - return false; - - // Postorder vist. - if (EdgeToVist == 2 || IsReducedValue) { - if (IsReducedValue) { - // Make sure that the opcodes of the operations that we are going to - // reduce match. - if (!ReducedValueOpcode) - ReducedValueOpcode = TreeN->getOpcode(); - else if (ReducedValueOpcode != TreeN->getOpcode()) - return false; - ReducedVals.push_back(TreeN); - } else { - // We need to be able to reassociate the adds. - if (!TreeN->isAssociative()) - return false; - ReductionOps.insert(TreeN); - } - // Retract. - Stack.pop_back(); - continue; - } - - // Visit left or right. - Value *NextV = TreeN->getOperand(EdgeToVist); - BinaryOperator *Next = dyn_cast(NextV); - if (Next) - Stack.push_back(std::make_pair(Next, 0)); - else if (NextV != Phi) - return false; - } - return true; - } - - /// \brief Attempt to vectorize the tree found by - /// matchAssociativeReduction. - bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { - if (ReducedVals.empty()) - return false; - - unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < ReduxWidth) - return false; - - Value *VectorizedTree = 0; - IRBuilder<> Builder(ReductionRoot); - FastMathFlags Unsafe; - Unsafe.setUnsafeAlgebra(); - Builder.SetFastMathFlags(Unsafe); - unsigned i = 0; - - for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { - ArrayRef ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); - - // Estimate cost. - int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); - if (Cost >= -SLPCostThreshold) - break; - - DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost - << ". (HorRdx)\n"); - - // Vectorize a tree. - Value *VectorizedRoot; - BoUpSLP::ValueList *Scalars; - tie(VectorizedRoot, Scalars) = V.vectorizeTree(); - - // Emit a reduction. - Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); - if (VectorizedTree) { - Builder.SetCurrentDebugLocation( - cast((*Scalars)[0])->getDebugLoc()); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedSubTree, "bin.rdx"); - } else - VectorizedTree = ReducedSubTree; - } - - if (VectorizedTree) { - // Finish the reduction. - for (; i < NumReducedVals; ++i) { - Builder.SetCurrentDebugLocation( - cast(ReducedVals[i])->getDebugLoc()); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedVals[i]); - } - // Update users. - if (ReductionPHI) { - assert(ReductionRoot != NULL && "Need a reduction operation"); - ReductionRoot->setOperand(0, VectorizedTree); - ReductionRoot->setOperand(1, ReductionPHI); - } else - ReductionRoot->replaceAllUsesWith(VectorizedTree); - } - return VectorizedTree != 0; - } - -private: - - /// \brief Calcuate the cost of a reduction. - int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { - Type *ScalarTy = FirstReducedVal->getType(); - Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); - - int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); - int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); - - IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; - int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - - int ScalarReduxCost = - ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); - - DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost - << " for reduction that starts with " << *FirstReducedVal - << " (It is a " - << (IsPairwiseReduction ? "pairwise" : "splitting") - << " reduction)\n"); - - return VecReduxCost - ScalarReduxCost; - } - - static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, - Value *R, const Twine &Name = "") { - if (Opcode == Instruction::FAdd) - return Builder.CreateFAdd(L, R, Name); - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); - } - - /// \brief Emit a horizontal reduction of the vectorized value. - Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { - assert(VectorizedValue && "Need to have a vectorized tree node"); - Instruction *ValToReduce = dyn_cast(VectorizedValue); - assert(isPowerOf2_32(ReduxWidth) && - "We only handle power-of-two reductions for now"); - - SmallVector ShuffleMask(ReduxWidth, 0); - Value *TmpVec = ValToReduce; - for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - if (IsPairwiseReduction) { - Value *LeftMask = - createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = - createRdxShuffleMask(ReduxWidth, i, true, false, Builder); - - Value *LeftShuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), - "rdx.shuf.r"); - TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); - } else { - Value *UpperHalf = - createRdxShuffleMask(ReduxWidth, i, false, false, Builder); - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); - } - } - - // The result is in the first element of the vector. - return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - } -}; - /// \brief Recognize construction of vectors like /// %ra = insertelement <4 x float> undef, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 @@ -2312,18 +1981,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!BI) continue; - // Try to match and vectorize a horizontal reduction. - HorizontalReduction HorRdx; - if (ShouldVectorizeHor && - HorRdx.matchAssociativeReduction(P, BI, DL) && - HorRdx.tryToReduce(R, TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - - Value *Inst = BI->getOperand(0); + Value *Inst = BI->getOperand(0); if (Inst == P) Inst = BI->getOperand(1); @@ -2333,28 +1991,10 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Changed = true; it = BB->begin(); e = BB->end(); - continue; } - continue; } - // Try to vectorize horizontal reductions feeding into a store. - if (StoreInst *SI = dyn_cast(it)) - if (BinaryOperator *BinOp = - dyn_cast(SI->getValueOperand())) { - HorizontalReduction HorRdx; - if (ShouldVectorizeHor && - ((HorRdx.matchAssociativeReduction(0, BinOp, DL) && - HorRdx.tryToReduce(R, TTI)) || - tryToVectorize(BinOp, R))) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - // Try to vectorize trees that start at compare instructions. if (CmpInst *CI = dyn_cast(it)) { if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll deleted file mode 100644 index 9517066ed2e8..000000000000 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ /dev/null @@ -1,415 +0,0 @@ -; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" - -; #include -; -; int foo(float *A, int n) { -; float sum = 0; -; for (intptr_t i=0; i < n; ++i) { -; sum += 7*A[i*4 ] + -; 7*A[i*4+1] + -; 7*A[i*4+2] + -; 7*A[i*4+3]; -; } -; return sum; -; } - -; CHECK-LABEL: add_red -; CHECK: fmul <4 x float> -; CHECK: shufflevector <4 x float> - -define i32 @add_red(float* %A, i32 %n) { -entry: - %cmp31 = icmp sgt i32 %n, 0 - br i1 %cmp31, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %0 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ] - %mul = shl nsw i64 %i.033, 2 - %arrayidx = getelementptr inbounds float* %A, i64 %mul - %1 = load float* %arrayidx, align 4 - %mul2 = fmul float %1, 7.000000e+00 - %add28 = or i64 %mul, 1 - %arrayidx4 = getelementptr inbounds float* %A, i64 %add28 - %2 = load float* %arrayidx4, align 4 - %mul5 = fmul float %2, 7.000000e+00 - %add6 = fadd fast float %mul2, %mul5 - %add829 = or i64 %mul, 2 - %arrayidx9 = getelementptr inbounds float* %A, i64 %add829 - %3 = load float* %arrayidx9, align 4 - %mul10 = fmul float %3, 7.000000e+00 - %add11 = fadd fast float %add6, %mul10 - %add1330 = or i64 %mul, 3 - %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330 - %4 = load float* %arrayidx14, align 4 - %mul15 = fmul float %4, 7.000000e+00 - %add16 = fadd fast float %add11, %mul15 - %add17 = fadd fast float %sum.032, %add16 - %inc = add nsw i64 %i.033, 1 - %exitcond = icmp eq i64 %inc, %0 - br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %phitmp = fptosi float %add17 to i32 - br label %for.end - -for.end: - %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] - ret i32 %sum.0.lcssa -} - -; int foo(float * restrict A, float * restrict B, int n) { -; float sum = 0; -; for (intptr_t i=0; i < n; ++i) { -; sum *= B[0]*A[i*4 ] + -; B[1]*A[i*4+1] + -; B[2]*A[i*4+2] + -; B[3]*A[i*4+3]; -; } -; return sum; -; } - -; CHECK-LABEL: mul_red -; CHECK: fmul <4 x float> -; CHECK: shufflevector <4 x float> - -define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) { -entry: - %cmp38 = icmp sgt i32 %n, 0 - br i1 %cmp38, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %0 = load float* %B, align 4 - %arrayidx4 = getelementptr inbounds float* %B, i64 1 - %1 = load float* %arrayidx4, align 4 - %arrayidx9 = getelementptr inbounds float* %B, i64 2 - %2 = load float* %arrayidx9, align 4 - %arrayidx15 = getelementptr inbounds float* %B, i64 3 - %3 = load float* %arrayidx15, align 4 - %4 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ] - %mul = shl nsw i64 %i.040, 2 - %arrayidx2 = getelementptr inbounds float* %A, i64 %mul - %5 = load float* %arrayidx2, align 4 - %mul3 = fmul float %0, %5 - %add35 = or i64 %mul, 1 - %arrayidx6 = getelementptr inbounds float* %A, i64 %add35 - %6 = load float* %arrayidx6, align 4 - %mul7 = fmul float %1, %6 - %add8 = fadd fast float %mul3, %mul7 - %add1136 = or i64 %mul, 2 - %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136 - %7 = load float* %arrayidx12, align 4 - %mul13 = fmul float %2, %7 - %add14 = fadd fast float %add8, %mul13 - %add1737 = or i64 %mul, 3 - %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737 - %8 = load float* %arrayidx18, align 4 - %mul19 = fmul float %3, %8 - %add20 = fadd fast float %add14, %mul19 - %mul21 = fmul float %sum.039, %add20 - %inc = add nsw i64 %i.040, 1 - %exitcond = icmp eq i64 %inc, %4 - br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %phitmp = fptosi float %mul21 to i32 - br label %for.end - -for.end: - %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] - ret i32 %sum.0.lcssa -} - -; int foo(float * restrict A, float * restrict B, int n) { -; float sum = 0; -; for (intptr_t i=0; i < n; ++i) { -; sum += B[0]*A[i*6 ] + -; B[1]*A[i*6+1] + -; B[2]*A[i*6+2] + -; B[3]*A[i*6+3] + -; B[4]*A[i*6+4] + -; B[5]*A[i*6+5] + -; B[6]*A[i*6+6] + -; B[7]*A[i*6+7] + -; B[8]*A[i*6+8]; -; } -; return sum; -; } - -; CHECK-LABEL: long_red -; CHECK: fmul <4 x float> -; CHECK: shufflevector <4 x float> - -define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) { -entry: - %cmp81 = icmp sgt i32 %n, 0 - br i1 %cmp81, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %0 = load float* %B, align 4 - %arrayidx4 = getelementptr inbounds float* %B, i64 1 - %1 = load float* %arrayidx4, align 4 - %arrayidx9 = getelementptr inbounds float* %B, i64 2 - %2 = load float* %arrayidx9, align 4 - %arrayidx15 = getelementptr inbounds float* %B, i64 3 - %3 = load float* %arrayidx15, align 4 - %arrayidx21 = getelementptr inbounds float* %B, i64 4 - %4 = load float* %arrayidx21, align 4 - %arrayidx27 = getelementptr inbounds float* %B, i64 5 - %5 = load float* %arrayidx27, align 4 - %arrayidx33 = getelementptr inbounds float* %B, i64 6 - %6 = load float* %arrayidx33, align 4 - %arrayidx39 = getelementptr inbounds float* %B, i64 7 - %7 = load float* %arrayidx39, align 4 - %arrayidx45 = getelementptr inbounds float* %B, i64 8 - %8 = load float* %arrayidx45, align 4 - %9 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ] - %mul = mul nsw i64 %i.083, 6 - %arrayidx2 = getelementptr inbounds float* %A, i64 %mul - %10 = load float* %arrayidx2, align 4 - %mul3 = fmul fast float %0, %10 - %add80 = or i64 %mul, 1 - %arrayidx6 = getelementptr inbounds float* %A, i64 %add80 - %11 = load float* %arrayidx6, align 4 - %mul7 = fmul fast float %1, %11 - %add8 = fadd fast float %mul3, %mul7 - %add11 = add nsw i64 %mul, 2 - %arrayidx12 = getelementptr inbounds float* %A, i64 %add11 - %12 = load float* %arrayidx12, align 4 - %mul13 = fmul fast float %2, %12 - %add14 = fadd fast float %add8, %mul13 - %add17 = add nsw i64 %mul, 3 - %arrayidx18 = getelementptr inbounds float* %A, i64 %add17 - %13 = load float* %arrayidx18, align 4 - %mul19 = fmul fast float %3, %13 - %add20 = fadd fast float %add14, %mul19 - %add23 = add nsw i64 %mul, 4 - %arrayidx24 = getelementptr inbounds float* %A, i64 %add23 - %14 = load float* %arrayidx24, align 4 - %mul25 = fmul fast float %4, %14 - %add26 = fadd fast float %add20, %mul25 - %add29 = add nsw i64 %mul, 5 - %arrayidx30 = getelementptr inbounds float* %A, i64 %add29 - %15 = load float* %arrayidx30, align 4 - %mul31 = fmul fast float %5, %15 - %add32 = fadd fast float %add26, %mul31 - %add35 = add nsw i64 %mul, 6 - %arrayidx36 = getelementptr inbounds float* %A, i64 %add35 - %16 = load float* %arrayidx36, align 4 - %mul37 = fmul fast float %6, %16 - %add38 = fadd fast float %add32, %mul37 - %add41 = add nsw i64 %mul, 7 - %arrayidx42 = getelementptr inbounds float* %A, i64 %add41 - %17 = load float* %arrayidx42, align 4 - %mul43 = fmul fast float %7, %17 - %add44 = fadd fast float %add38, %mul43 - %add47 = add nsw i64 %mul, 8 - %arrayidx48 = getelementptr inbounds float* %A, i64 %add47 - %18 = load float* %arrayidx48, align 4 - %mul49 = fmul fast float %8, %18 - %add50 = fadd fast float %add44, %mul49 - %add51 = fadd fast float %sum.082, %add50 - %inc = add nsw i64 %i.083, 1 - %exitcond = icmp eq i64 %inc, %9 - br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %phitmp = fptosi float %add51 to i32 - br label %for.end - -for.end: - %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] - ret i32 %sum.0.lcssa -} - -; int foo(float * restrict A, float * restrict B, int n) { -; float sum = 0; -; for (intptr_t i=0; i < n; ++i) { -; sum += B[0]*A[i*4 ]; -; sum += B[1]*A[i*4+1]; -; sum += B[2]*A[i*4+2]; -; sum += B[3]*A[i*4+3]; -; } -; return sum; -; } - -; CHECK-LABEL: chain_red -; CHECK: fmul <4 x float> -; CHECK: shufflevector <4 x float> - -define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) { -entry: - %cmp41 = icmp sgt i32 %n, 0 - br i1 %cmp41, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %0 = load float* %B, align 4 - %arrayidx4 = getelementptr inbounds float* %B, i64 1 - %1 = load float* %arrayidx4, align 4 - %arrayidx10 = getelementptr inbounds float* %B, i64 2 - %2 = load float* %arrayidx10, align 4 - %arrayidx16 = getelementptr inbounds float* %B, i64 3 - %3 = load float* %arrayidx16, align 4 - %4 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ] - %mul = shl nsw i64 %i.043, 2 - %arrayidx2 = getelementptr inbounds float* %A, i64 %mul - %5 = load float* %arrayidx2, align 4 - %mul3 = fmul fast float %0, %5 - %add = fadd fast float %sum.042, %mul3 - %add638 = or i64 %mul, 1 - %arrayidx7 = getelementptr inbounds float* %A, i64 %add638 - %6 = load float* %arrayidx7, align 4 - %mul8 = fmul fast float %1, %6 - %add9 = fadd fast float %add, %mul8 - %add1239 = or i64 %mul, 2 - %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239 - %7 = load float* %arrayidx13, align 4 - %mul14 = fmul fast float %2, %7 - %add15 = fadd fast float %add9, %mul14 - %add1840 = or i64 %mul, 3 - %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840 - %8 = load float* %arrayidx19, align 4 - %mul20 = fmul fast float %3, %8 - %add21 = fadd fast float %add15, %mul20 - %inc = add nsw i64 %i.043, 1 - %exitcond = icmp eq i64 %inc, %4 - br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %phitmp = fptosi float %add21 to i32 - br label %for.end - -for.end: - %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] - ret i32 %sum.0.lcssa -} - -; int foo(float * restrict A, float * restrict B, float * restrict C, int n) { -; float sum = 0; -; for (intptr_t i=0; i < n; ++i) { -; C[i] = B[0] *A[i*4 ] + -; B[1] *A[i*4+1] + -; B[2] *A[i*4+2] + -; B[3] *A[i*4+3]; -; } -; return sum; -; } - -; CHECK-LABEL: store_red -; CHECK: fmul <4 x float> -; CHECK: shufflevector <4 x float> - -define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) { -entry: - %cmp37 = icmp sgt i32 %n, 0 - br i1 %cmp37, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %arrayidx4 = getelementptr inbounds float* %B, i64 1 - %arrayidx9 = getelementptr inbounds float* %B, i64 2 - %arrayidx15 = getelementptr inbounds float* %B, i64 3 - %0 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] - %1 = load float* %B, align 4 - %mul = shl nsw i64 %i.039, 2 - %arrayidx2 = getelementptr inbounds float* %A, i64 %mul - %2 = load float* %arrayidx2, align 4 - %mul3 = fmul fast float %1, %2 - %3 = load float* %arrayidx4, align 4 - %add34 = or i64 %mul, 1 - %arrayidx6 = getelementptr inbounds float* %A, i64 %add34 - %4 = load float* %arrayidx6, align 4 - %mul7 = fmul fast float %3, %4 - %add8 = fadd fast float %mul3, %mul7 - %5 = load float* %arrayidx9, align 4 - %add1135 = or i64 %mul, 2 - %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135 - %6 = load float* %arrayidx12, align 4 - %mul13 = fmul fast float %5, %6 - %add14 = fadd fast float %add8, %mul13 - %7 = load float* %arrayidx15, align 4 - %add1736 = or i64 %mul, 3 - %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736 - %8 = load float* %arrayidx18, align 4 - %mul19 = fmul fast float %7, %8 - %add20 = fadd fast float %add14, %mul19 - store float %add20, float* %C.addr.038, align 4 - %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1 - %inc = add nsw i64 %i.039, 1 - %exitcond = icmp eq i64 %inc, %0 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret i32 0 -} - - -; void foo(double * restrict A, double * restrict B, double * restrict C, -; int n) { -; for (intptr_t i=0; i < n; ++i) { -; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1]; -; } -; } - -; CHECK-LABEL: store_red_double -; CHECK: fmul <2 x double> -; CHECK: extractelement <2 x double> -; CHECK: extractelement <2 x double> - -define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) { -entry: - %cmp17 = icmp sgt i32 %n, 0 - br i1 %cmp17, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: - %0 = load double* %B, align 8 - %arrayidx4 = getelementptr inbounds double* %B, i64 1 - %1 = load double* %arrayidx4, align 8 - %2 = sext i32 %n to i64 - br label %for.body - -for.body: - %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %mul = shl nsw i64 %i.018, 2 - %arrayidx2 = getelementptr inbounds double* %A, i64 %mul - %3 = load double* %arrayidx2, align 8 - %mul3 = fmul fast double %0, %3 - %add16 = or i64 %mul, 1 - %arrayidx6 = getelementptr inbounds double* %A, i64 %add16 - %4 = load double* %arrayidx6, align 8 - %mul7 = fmul fast double %1, %4 - %add8 = fadd fast double %mul3, %mul7 - %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018 - store double %add8, double* %arrayidx9, align 8 - %inc = add nsw i64 %i.018, 1 - %exitcond = icmp eq i64 %inc, %2 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -}