[LV] Vectorizer should adjust trip count in profile information

Summary: A vectorized loop processes VF x UF elements per iteration, so its total number of iterations decreases proportionally. In addition, the epilog loop cannot have more than VF x UF - 1 iterations. This patch updates the profile information accordingly.

Reviewers: hsaito, Ayal, fhahn, reames, silvas, dcaballe, SjoerdMeijer, mkuper, DaniilSuchkov

Reviewed By: Ayal, DaniilSuchkov

Subscribers: fedor.sergeev, hiraditya, rkruppe, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67905
This commit is contained in:
Evgeniy Brevnov 2019-12-27 12:39:24 +07:00
parent 1f946ee2fa
commit af7e158872
5 changed files with 235 additions and 13 deletions

View File

@ -262,10 +262,22 @@ TransformationMode hasLICMVersioningTransformation(Loop *L);
void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
unsigned V = 0); unsigned V = 0);
/// Get a loop's estimated trip count based on branch weight metadata. /// Returns a loop's estimated trip count based on branch weight metadata.
/// In addition if \p EstimatedLoopInvocationWeight is not null it is
/// initialized with weight of loop's latch leading to the exit.
/// Returns 0 when the count is estimated to be 0, or None when a meaningful /// Returns 0 when the count is estimated to be 0, or None when a meaningful
/// estimate can not be made. /// estimate can not be made.
Optional<unsigned> getLoopEstimatedTripCount(Loop *L); Optional<unsigned>
getLoopEstimatedTripCount(Loop *L,
unsigned *EstimatedLoopInvocationWeight = nullptr);
/// Set the branch weight metadata on the latch branch of \p L so that it
/// reflects a loop executing \p EstimatedTripCount iterations per entry, with
/// \p EstimatedLoopInvocationWeight used as the weight of the latch edge that
/// exits the loop. Any profile metadata already attached to the latch branch
/// is replaced. An \p EstimatedTripCount of zero is recorded as zero weights
/// on both edges.
///
/// Returns true if the metadata was updated and false otherwise. The update
/// can only succeed when the loop has a latch block that controls the loop
/// exit.
bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
unsigned EstimatedLoopInvocationWeight);
/// Check inner loop (L) backedge count is known to be invariant on all /// Check inner loop (L) backedge count is known to be invariant on all
/// iterations of its outer loop. If the loop has no parent, this is trivially /// iterations of its outer loop. If the loop has no parent, this is trivially
@ -370,6 +382,23 @@ int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
DominatorTree *DT, ReplaceExitVal ReplaceExitValue, DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
SmallVector<WeakTrackingVH, 16> &DeadInsts); SmallVector<WeakTrackingVH, 16> &DeadInsts);
/// Set weights for \p UnrolledLoop and \p RemainderLoop based on the weights
/// of \p OrigLoop, assuming the iterations of \p OrigLoop are distributed
/// between the two as follows: \p UnrolledLoop receives weights that reflect
/// TC/UF iterations, and \p RemainderLoop receives weights that reflect the
/// remaining TC%UF iterations, where TC is the estimated trip count of
/// \p OrigLoop.
///
/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p
/// RemainderLoop, in which case the weights of \p OrigLoop are updated
/// accordingly.
/// Note also that the behavior is undefined if \p UnrolledLoop and \p
/// RemainderLoop are equal. \p UF must be greater than zero.
/// If \p OrigLoop has no associated profile info, nothing happens.
///
/// This utility is intended for transformations such as the unroller and the
/// vectorizer, for which this split of iterations is typical.
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
Loop *RemainderLoop, uint64_t UF);
} // end namespace llvm } // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H

View File

@ -32,6 +32,7 @@
#include "llvm/IR/Dominators.h" #include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h" #include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h" #include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueHandle.h"
@ -690,17 +691,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
} }
} }
Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { /// Checks if \p L has single exit through latch block except possibly
// Support loops with an exiting latch and other existing exists only /// "deoptimizing" exits. Returns branch instruction terminating the loop
// deoptimize. /// latch if above check is successful, nullptr otherwise.
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
// Get the branch weights for the loop's backedge.
BasicBlock *Latch = L->getLoopLatch(); BasicBlock *Latch = L->getLoopLatch();
if (!Latch) if (!Latch)
return None; return nullptr;
BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator()); BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
return None; return nullptr;
assert((LatchBR->getSuccessor(0) == L->getHeader() || assert((LatchBR->getSuccessor(0) == L->getHeader() ||
LatchBR->getSuccessor(1) == L->getHeader()) && LatchBR->getSuccessor(1) == L->getHeader()) &&
@ -711,21 +712,36 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
if (any_of(ExitBlocks, [](const BasicBlock *EB) { if (any_of(ExitBlocks, [](const BasicBlock *EB) {
return !EB->getTerminatingDeoptimizeCall(); return !EB->getTerminatingDeoptimizeCall();
})) }))
return nullptr;
return LatchBR;
}
Optional<unsigned>
llvm::getLoopEstimatedTripCount(Loop *L,
unsigned *EstimatedLoopInvocationWeight) {
// Support loops with an exiting latch where any other existing exits only
// deoptimize.
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return None; return None;
// To estimate the number of times the loop body was executed, we want to // To estimate the number of times the loop body was executed, we want to
// know the number of times the backedge was taken, vs. the number of times // know the number of times the backedge was taken, vs. the number of times
// we exited the loop. // we exited the loop.
uint64_t BackedgeTakenWeight, LatchExitWeight; uint64_t BackedgeTakenWeight, LatchExitWeight;
if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
return None; return None;
if (LatchBR->getSuccessor(0) != L->getHeader()) if (LatchBranch->getSuccessor(0) != L->getHeader())
std::swap(BackedgeTakenWeight, LatchExitWeight); std::swap(BackedgeTakenWeight, LatchExitWeight);
if (!LatchExitWeight) if (!LatchExitWeight)
return None; return None;
if (EstimatedLoopInvocationWeight)
*EstimatedLoopInvocationWeight = LatchExitWeight;
// Estimated backedge taken count is a ratio of the backedge taken weight by // Estimated backedge taken count is a ratio of the backedge taken weight by
// the weight of the edge exiting the loop, rounded to nearest. // the weight of the edge exiting the loop, rounded to nearest.
uint64_t BackedgeTakenCount = uint64_t BackedgeTakenCount =
@ -734,6 +750,37 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
return BackedgeTakenCount + 1; return BackedgeTakenCount + 1;
} }
/// Rewrites the branch weight metadata on the latch branch of \p L so that it
/// describes a loop running \p EstimatedTripCount iterations per entry, using
/// \p EstimatedLoopInvocationWeight as the weight of the latch edge that
/// leaves the loop.
///
/// The backedge weight is (EstimatedTripCount - 1) * exit weight. It is
/// computed in 64 bits and clamped to the largest value representable in
/// `unsigned`, so that a hot loop cannot wrap around to a small, misleading
/// weight. A trip count of zero is recorded as the weights {0, 0}.
///
/// \returns false, leaving \p L untouched, when no suitable latch branch is
/// found by getExpectedExitLoopLatchBranch; true once the metadata is set.
bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
                                     unsigned EstimatedLoopInvocationWeight) {
  // Support loops with an exiting latch where any other existing exits only
  // deoptimize.
  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
  if (!LatchBranch)
    return false;

  // Calculate taken and exit weights.
  unsigned LatchExitWeight = 0;
  unsigned BackedgeTakenWeight = 0;

  if (EstimatedTripCount > 0) {
    LatchExitWeight = EstimatedLoopInvocationWeight;

    // Multiply in 64 bits: the product of two `unsigned` values may not fit
    // in `unsigned`, and a wrapped result would make a hot loop look cold.
    const uint64_t MaxWeight = ~0U;
    const uint64_t WideBackedgeWeight =
        static_cast<uint64_t>(EstimatedTripCount - 1) * LatchExitWeight;
    BackedgeTakenWeight = static_cast<unsigned>(
        WideBackedgeWeight > MaxWeight ? MaxWeight : WideBackedgeWeight);
  }

  // Make a swap if back edge is taken when condition is "false".
  if (LatchBranch->getSuccessor(0) != L->getHeader())
    std::swap(BackedgeTakenWeight, LatchExitWeight);

  MDBuilder MDB(LatchBranch->getContext());

  // Set/Update profile metadata.
  LatchBranch->setMetadata(
      LLVMContext::MD_prof,
      MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));

  return true;
}
bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
ScalarEvolution &SE) { ScalarEvolution &SE) {
Loop *OuterL = InnerLoop->getParentLoop(); Loop *OuterL = InnerLoop->getParentLoop();
@ -1351,3 +1398,29 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI,
Rewriter.clearInsertPoint(); Rewriter.clearInsertPoint();
return NumReplaced; return NumReplaced;
} }
/// Splits the profile of \p OrigLoop between \p UnrolledLoop and
/// \p RemainderLoop: the former is given the quotient and the latter the
/// remainder of the original estimated trip count divided by \p UF. Both
/// loops keep the invocation weight of \p OrigLoop. Nothing is changed when
/// \p OrigLoop has no usable trip count estimate.
void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
                                        Loop *RemainderLoop, uint64_t UF) {
  assert(UF > 0 && "Zero unrolled factor is not supported");
  assert(UnrolledLoop != RemainderLoop &&
         "Unrolled and Remainder loops are expected to distinct");

  // Read the original estimate first: OrigLoop may alias one of the loops
  // whose metadata is rewritten below.
  unsigned InvocationWeight = 0;
  Optional<unsigned> MaybeTripCount =
      getLoopEstimatedTripCount(OrigLoop, &InvocationWeight);
  if (!MaybeTripCount)
    return;

  const unsigned OrigTripCount = *MaybeTripCount;

  // Iterations executed by the unrolled loop.
  unsigned UnrolledTripCount = OrigTripCount / UF;
  setLoopEstimatedTripCount(UnrolledLoop, UnrolledTripCount, InvocationWeight);

  // Iterations left over for the remainder loop.
  unsigned RemainderTripCount = OrigTripCount % UF;
  setLoopEstimatedTripCount(RemainderLoop, RemainderTripCount,
                            InvocationWeight);
}

View File

@ -3483,6 +3483,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// Remove redundant induction instructions. // Remove redundant induction instructions.
cse(LoopVectorBody); cse(LoopVectorBody);
// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
// represented by LoopScalarBody becomes remainder loop after vectorization.
//
// For cases like foldTailByMasking() and requiresScalarEpilogue() we may
// end up getting slightly roughened result but that should be OK since
// profile is not inherently precise anyway. Note also possible bypass of
// vector code caused by legality checks is ignored, assigning all the weight
// to the vector loop, optimistically.
setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
LI->getLoopFor(LoopVectorBody),
LI->getLoopFor(LoopScalarBody), VF * UF);
} }
void InnerLoopVectorizer::fixCrossIterationPHIs() { void InnerLoopVectorizer::fixCrossIterationPHIs() {

View File

@ -0,0 +1,96 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@a = dso_local global [1024 x i32] zeroinitializer, align 16
@b = dso_local global [1024 x i32] zeroinitializer, align 16
; Check correctness of profile info for vectorization without epilog.
; The scalar loop below exits after 1024 iterations and its latch carries the
; weights {1, 1023} (!6), i.e. an estimated trip count of 1024 per loop entry.
;  * First run line (width 4, interleave 1): the vector loop is expected to
;    get 1024 / 4 = 256 iterations, recorded as {1, 255}, and the scalar
;    remainder 1024 % 4 = 0 iterations, recorded as {0, 0}.
;  * Second run line (width 4, interleave 4): the vector loop is expected to
;    get 1024 / 16 = 64 iterations, recorded as {1, 63}, and the remainder is
;    again empty, recorded as {0, 0}.
; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @_Z3foov() local_unnamed_addr #0 {
; CHECK-LABEL: @_Z3foov(
; CHECK: [[VECTOR_BODY:vector\.body]]:
; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
; CHECK: [[FOR_BODY:for\.body]]:
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
; CHECK-MASKED: [[FOR_BODY:for\.body]]:
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
;
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
%1 = trunc i64 %indvars.iv to i32
%mul = mul nsw i32 %0, %1
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
%add = add nsw i32 %2, %mul
store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
}
; Check correctness of profile info for vectorization with epilog.
; The scalar loop below exits after 1027 iterations and its latch carries the
; weights {1, 1026} (!7), i.e. an estimated trip count of 1027 per loop entry.
;  * First run line (width 4, interleave 1): the vector loop is expected to
;    get 1027 / 4 = 256 iterations, recorded as {1, 255}, and the scalar
;    remainder 1027 % 4 = 3 iterations, recorded as {1, 2}.
;  * Second run line (width 4, interleave 4): the vector loop is expected to
;    get 1027 / 16 = 64 iterations, recorded as {1, 63}, and the remainder
;    1027 % 16 = 3 iterations, recorded as {1, 2}.
; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
; CHECK-LABEL: @_Z3foo2v(
; CHECK: [[VECTOR_BODY:vector\.body]]:
; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
; CHECK: [[FOR_BODY:for\.body]]:
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
; CHECK-MASKED: [[FOR_BODY:for\.body]]:
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
;
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
%1 = trunc i64 %indvars.iv to i32
%mul = mul nsw i32 %0, %1
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
%add = add nsw i32 %2, %mul
store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1027
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
}
attributes #0 = { "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
!6 = !{!"branch_weights", i32 1, i32 1023}
!7 = !{!"branch_weights", i32 1, i32 1026}

View File

@ -61,8 +61,10 @@ define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
; but has a high trip count per invocation. Vectorize it. ; but has a high trip count per invocation. Vectorize it.
; CHECK-LABEL: @foo_low_trip_count3( ; CHECK-LABEL: @foo_low_trip_count3(
; CHECK: vector.body: ; CHECK: [[VECTOR_BODY:vector\.body]]:
; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
; CHECK: [[FOR_BODY:for\.body]]:
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
entry: entry:
br i1 %cond, label %for.preheader, label %for.end, !prof !2 br i1 %cond, label %for.preheader, label %for.end, !prof !2
@ -205,6 +207,15 @@ for.end: ; preds = %for.body
ret i32 0 ret i32 0
} }
; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1
; for vectorized and remainder loops, respectively, therefore their
; estimatedBackedgeTakenCounts are 249 and 0, and so the weights recorded with
; loop invocation weights of 10 are the above {10, 2490} and {10, 0}.
!0 = !{!"function_entry_count", i64 100} !0 = !{!"function_entry_count", i64 100}
!1 = !{!"branch_weights", i32 100, i32 0} !1 = !{!"branch_weights", i32 100, i32 0}
!2 = !{!"branch_weights", i32 10, i32 90} !2 = !{!"branch_weights", i32 10, i32 90}