forked from OSchip/llvm-project
[vectorize] Initial version of respecting PGO in the vectorizer: treat
cold loops as if they were being optimized for size. Nothing fancy here. Simple test case included. The nice thing is that we can now incrementally build on top of this to drive other heuristics. All of the infrastructure work is done to get the profile information into this layer. The remaining work necessary to make this a fully general purpose loop unroller for very hot loops is to make it a fully general purpose loop unroller. Things I know of but am not going to have time to benchmark and fix in the immediate future: 1) Don't disable the entire pass when the target is lacking vector registers. This really doesn't make any sense any more. 2) Teach the unroller at least and the vectorizer potentially to handle non-if-converted loops. This is trivial for the unroller but hard for the vectorizer. 3) Compute the relative hotness of the loop and thread that down to the various places that make cost tradeoffs (very likely only the unroller makes sense here, and then only when dealing with loops that are small enough for unrolling to not completely blow out the LSD). I'm still dubious how useful hotness information will be. So far, my experiments show that if we can get the correct logic for determining when unrolling actually helps performance, the code size impact is completely unimportant and we can unroll in all cases. But at least we'll no longer burn code size on cold code. One somewhat unrelated idea that I've had forever but not had time to implement: mark all functions which are only reachable via the global constructors rigging in the module as optsize. This would also decrease the impact of any more aggressive heuristics here on code size. llvm-svn: 200219
This commit is contained in:
parent
9e709bce86
commit
e24f3973eb
|
@ -56,6 +56,7 @@
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
#include "llvm/ADT/StringExtras.h"
|
#include "llvm/ADT/StringExtras.h"
|
||||||
#include "llvm/Analysis/AliasAnalysis.h"
|
#include "llvm/Analysis/AliasAnalysis.h"
|
||||||
|
#include "llvm/Analysis/BlockFrequencyInfo.h"
|
||||||
#include "llvm/Analysis/LoopInfo.h"
|
#include "llvm/Analysis/LoopInfo.h"
|
||||||
#include "llvm/Analysis/LoopIterator.h"
|
#include "llvm/Analysis/LoopIterator.h"
|
||||||
#include "llvm/Analysis/LoopPass.h"
|
#include "llvm/Analysis/LoopPass.h"
|
||||||
|
@ -78,6 +79,7 @@
|
||||||
#include "llvm/IR/Value.h"
|
#include "llvm/IR/Value.h"
|
||||||
#include "llvm/IR/Verifier.h"
|
#include "llvm/IR/Verifier.h"
|
||||||
#include "llvm/Pass.h"
|
#include "llvm/Pass.h"
|
||||||
|
#include "llvm/Support/BranchProbability.h"
|
||||||
#include "llvm/Support/CommandLine.h"
|
#include "llvm/Support/CommandLine.h"
|
||||||
#include "llvm/Support/Debug.h"
|
#include "llvm/Support/Debug.h"
|
||||||
#include "llvm/Support/PatternMatch.h"
|
#include "llvm/Support/PatternMatch.h"
|
||||||
|
@ -980,18 +982,27 @@ struct LoopVectorize : public FunctionPass {
|
||||||
LoopInfo *LI;
|
LoopInfo *LI;
|
||||||
TargetTransformInfo *TTI;
|
TargetTransformInfo *TTI;
|
||||||
DominatorTree *DT;
|
DominatorTree *DT;
|
||||||
|
BlockFrequencyInfo *BFI;
|
||||||
TargetLibraryInfo *TLI;
|
TargetLibraryInfo *TLI;
|
||||||
bool DisableUnrolling;
|
bool DisableUnrolling;
|
||||||
bool AlwaysVectorize;
|
bool AlwaysVectorize;
|
||||||
|
|
||||||
|
BlockFrequency ColdEntryFreq;
|
||||||
|
|
||||||
virtual bool runOnFunction(Function &F) {
|
virtual bool runOnFunction(Function &F) {
|
||||||
SE = &getAnalysis<ScalarEvolution>();
|
SE = &getAnalysis<ScalarEvolution>();
|
||||||
DL = getAnalysisIfAvailable<DataLayout>();
|
DL = getAnalysisIfAvailable<DataLayout>();
|
||||||
LI = &getAnalysis<LoopInfo>();
|
LI = &getAnalysis<LoopInfo>();
|
||||||
TTI = &getAnalysis<TargetTransformInfo>();
|
TTI = &getAnalysis<TargetTransformInfo>();
|
||||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
BFI = &getAnalysis<BlockFrequencyInfo>();
|
||||||
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
|
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
|
||||||
|
|
||||||
|
// Compute some weights outside of the loop over the loops. Compute this
|
||||||
|
// using a BranchProbability to re-use its scaling math.
|
||||||
|
const BranchProbability ColdProb(1, 5); // 20%
|
||||||
|
ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
|
||||||
|
|
||||||
// If the target claims to have no vector registers don't attempt
|
// If the target claims to have no vector registers don't attempt
|
||||||
// vectorization.
|
// vectorization.
|
||||||
if (!TTI->getNumberOfRegisters(true))
|
if (!TTI->getNumberOfRegisters(true))
|
||||||
|
@ -1064,6 +1075,13 @@ struct LoopVectorize : public FunctionPass {
|
||||||
bool OptForSize =
|
bool OptForSize =
|
||||||
Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
|
Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
|
||||||
|
|
||||||
|
// Compute the weighted frequency of this loop being executed and see if it
|
||||||
|
// is less than 20% of the function entry baseline frequency. Note that we
|
||||||
|
// always have a canonical loop here because we think we *can* vectorize.
|
||||||
|
BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
|
||||||
|
if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
|
||||||
|
OptForSize = true;
|
||||||
|
|
||||||
// Check the function attributes to see if implicit floats are allowed.
|
// Check the function attributes to see if implicit floats are allowed.
|
||||||
// FIXME: This check doesn't seem possibly correct -- what if the loop is
|
// FIXME: This check doesn't seem possibly correct -- what if the loop is
|
||||||
// an integer loop and the vector instructions selected are purely integer
|
// an integer loop and the vector instructions selected are purely integer
|
||||||
|
@ -1109,6 +1127,7 @@ struct LoopVectorize : public FunctionPass {
|
||||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||||
AU.addRequiredID(LoopSimplifyID);
|
AU.addRequiredID(LoopSimplifyID);
|
||||||
AU.addRequiredID(LCSSAID);
|
AU.addRequiredID(LCSSAID);
|
||||||
|
AU.addRequired<BlockFrequencyInfo>();
|
||||||
AU.addRequired<DominatorTreeWrapperPass>();
|
AU.addRequired<DominatorTreeWrapperPass>();
|
||||||
AU.addRequired<LoopInfo>();
|
AU.addRequired<LoopInfo>();
|
||||||
AU.addRequired<ScalarEvolution>();
|
AU.addRequired<ScalarEvolution>();
|
||||||
|
@ -5469,6 +5488,7 @@ char LoopVectorize::ID = 0;
|
||||||
static const char lv_name[] = "Loop Vectorization";
|
static const char lv_name[] = "Loop Vectorization";
|
||||||
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
|
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
|
||||||
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
|
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
|
||||||
|
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
|
||||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||||
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
|
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
|
||||||
INITIALIZE_PASS_DEPENDENCY(LCSSA)
|
INITIALIZE_PASS_DEPENDENCY(LCSSA)
|
||||||
|
|
|
@ -115,6 +115,31 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; N is unknown, we need a tail. Can't vectorize because the loop is cold.
|
||||||
|
;CHECK-LABEL: @example4(
|
||||||
|
;CHECK-NOT: <4 x i32>
|
||||||
|
;CHECK: ret void
|
||||||
|
define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
|
||||||
|
%1 = icmp eq i32 %n, 0
|
||||||
|
br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
|
||||||
|
|
||||||
|
.lr.ph: ; preds = %0, %.lr.ph
|
||||||
|
%.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
|
||||||
|
%.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
|
||||||
|
%.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
|
||||||
|
%2 = add nsw i32 %.05, -1
|
||||||
|
%3 = getelementptr inbounds i32* %.023, i64 1
|
||||||
|
%4 = load i32* %.023, align 16
|
||||||
|
%5 = getelementptr inbounds i32* %.014, i64 1
|
||||||
|
store i32 %4, i32* %.014, align 16
|
||||||
|
%6 = icmp eq i32 %2, 0
|
||||||
|
br i1 %6, label %._crit_edge, label %.lr.ph
|
||||||
|
|
||||||
|
._crit_edge: ; preds = %.lr.ph, %0
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
|
||||||
|
|
||||||
; We can't vectorize this one because we need a runtime ptr check.
|
; We can't vectorize this one because we need a runtime ptr check.
|
||||||
;CHECK-LABEL: @example23(
|
;CHECK-LABEL: @example23(
|
||||||
|
|
Loading…
Reference in New Issue