diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 750fc40d2615..5d1f85f86df1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -56,6 +56,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -78,6 +79,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" @@ -980,18 +982,27 @@ struct LoopVectorize : public FunctionPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; + BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; bool DisableUnrolling; bool AlwaysVectorize; + BlockFrequency ColdEntryFreq; + virtual bool runOnFunction(Function &F) { SE = &getAnalysis(); DL = getAnalysisIfAvailable(); LI = &getAnalysis(); TTI = &getAnalysis(); DT = &getAnalysis().getDomTree(); + BFI = &getAnalysis(); TLI = getAnalysisIfAvailable(); + // Compute some weights outside of the loop over the loops. Compute this + // using a BranchProbability to re-use its scaling math. + const BranchProbability ColdProb(1, 5); // 20% + ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; + // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(true)) @@ -1064,6 +1075,13 @@ struct LoopVectorize : public FunctionPass { bool OptForSize = Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize); + // Compute the weighted frequency of this loop being executed and see if it + // is less than 20% of the function entry baseline frequency. Note that we + // always have a canonical loop here because we think we *can* vectoriez. + BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); + if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq) + OptForSize = true; + // Check the function attributes to see if implicit floats are allowed.a // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer @@ -1109,6 +1127,7 @@ struct LoopVectorize : public FunctionPass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -5469,6 +5488,7 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 14ac417bb573..1d46366369c8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -115,6 +115,31 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture ret void } +; N is unknown, we need a tail. Can't vectorize because the loop is cold. +;CHECK-LABEL: @example4( +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0 + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 64, i32 4} ; We can't vectorize this one because we need a runtime ptr check. ;CHECK-LABEL: @example23(