forked from OSchip/llvm-project
[BOLT] Tail duplication analysis pass
Summary: Created a binary pass that records how many times tail duplication would be used and how many cache misses it would theoretically stop (cherry picked from FBD29619858)
This commit is contained in:
parent
60b15062e1
commit
2f46660559
|
@ -27,6 +27,7 @@
|
|||
#include "Passes/RetpolineInsertion.h"
|
||||
#include "Passes/SplitFunctions.h"
|
||||
#include "Passes/StokeInfo.h"
|
||||
#include "Passes/TailDuplication.h"
|
||||
#include "Passes/ValidateInternalCalls.h"
|
||||
#include "Passes/VeneerElimination.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
@ -76,6 +77,11 @@ JTFootprintReductionFlag("jt-footprint-reduction",
|
|||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool> TailDuplicationFlag(
|
||||
"tail-duplication",
|
||||
cl::desc("duplicate unconditional branches that cross a cache line"),
|
||||
cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintJTFootprintReduction("print-after-jt-footprint-reduction",
|
||||
cl::desc("print function after jt-footprint-reduction pass"),
|
||||
|
@ -449,6 +455,9 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
|
|||
|
||||
Manager.registerPass(std::make_unique<LoopInversionPass>());
|
||||
|
||||
Manager.registerPass(std::make_unique<TailDuplication>(),
|
||||
opts::TailDuplicationFlag);
|
||||
|
||||
// This pass syncs local branches with CFG. If any of the following
|
||||
// passes breaks the sync - they either need to re-run the pass or
|
||||
// fix branches consistency internally.
|
||||
|
|
|
@ -36,6 +36,7 @@ add_llvm_library(LLVMBOLTPasses
|
|||
StackPointerTracking.cpp
|
||||
StackReachingUses.cpp
|
||||
StokeInfo.cpp
|
||||
TailDuplication.cpp
|
||||
ValidateInternalCalls.cpp
|
||||
VeneerElimination.cpp
|
||||
RetpolineInsertion.cpp
|
||||
|
|
|
@ -0,0 +1,172 @@
|
|||
//===--------- Passes/TailDuplication.cpp -------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "TailDuplication.h"
|
||||
|
||||
#include <numeric>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
|
||||
static cl::opt<bool> TailDuplicationAggressive(
|
||||
"tail-duplication-aggressive",
|
||||
cl::desc("tail duplication should act aggressively in duplicating multiple "
|
||||
"blocks per tail"),
|
||||
cl::ZeroOrMore, cl::ReallyHidden, cl::init(false),
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
|
||||
cl::desc("minimum offset needed between block "
|
||||
"and successor to allow duplication"),
|
||||
cl::ZeroOrMore, cl::ReallyHidden, cl::init(64),
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<unsigned> TailDuplicationMaximumDuplication(
|
||||
"tail-duplication-maximum-duplication",
|
||||
cl::desc("maximum size of duplicated blocks (in bytes)"), cl::ZeroOrMore,
|
||||
cl::ReallyHidden, cl::init(64), cl::cat(BoltOptCategory));
|
||||
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB,
|
||||
const BinaryBasicBlock &Succ) const {
|
||||
if (&BB == &Succ)
|
||||
return true;
|
||||
|
||||
BinaryFunction::BasicBlockOrderType BlockLayout =
|
||||
BB.getFunction()->getLayout();
|
||||
uint64_t Distance = 0;
|
||||
int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1;
|
||||
|
||||
for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex();
|
||||
I += Direction) {
|
||||
Distance += BlockLayout[I]->getOriginalSize();
|
||||
if (Distance > opts::TailDuplicationMinimumOffset)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<BinaryBasicBlock *>
|
||||
TailDuplication::moderateCodeToDuplicate(BinaryBasicBlock &BB) const {
|
||||
std::vector<BinaryBasicBlock *> BlocksToDuplicate;
|
||||
for (auto Itr = BB.succ_begin(); Itr != BB.succ_end(); ++Itr) {
|
||||
if ((*Itr)->getLayoutIndex() == BB.getLayoutIndex() + 1) {
|
||||
// If duplicating would introduce a new branch, don't duplicate
|
||||
return BlocksToDuplicate;
|
||||
}
|
||||
}
|
||||
BlocksToDuplicate.push_back(&BB);
|
||||
return BlocksToDuplicate;
|
||||
}
|
||||
|
||||
std::vector<BinaryBasicBlock *>
|
||||
TailDuplication::aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const {
|
||||
std::vector<BinaryBasicBlock *> BlocksToDuplicate;
|
||||
BinaryBasicBlock *CurrBB = &BB;
|
||||
while (CurrBB) {
|
||||
BlocksToDuplicate.push_back(CurrBB);
|
||||
// With no successors, we've reached the end and should duplicate all of
|
||||
// BlocksToDuplicate
|
||||
if (CurrBB->succ_size() == 0)
|
||||
break;
|
||||
|
||||
// With two successors, if they're both a jump, we should duplicate all
|
||||
// blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of
|
||||
// blocks to copy
|
||||
if (CurrBB->succ_size() >= 2) {
|
||||
if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
|
||||
CurrBB->getLayoutIndex() + 1 ||
|
||||
CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
|
||||
CurrBB->getLayoutIndex() + 1)
|
||||
BlocksToDuplicate.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
// With one successor, if its a jump, we should duplicate all blocks in
|
||||
// BlocksToDuplicate. Otherwise, we should keep going
|
||||
BinaryBasicBlock *Succ = CurrBB->getSuccessor();
|
||||
if (Succ->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
|
||||
break;
|
||||
CurrBB = Succ;
|
||||
}
|
||||
// Don't duplicate if its too much code
|
||||
unsigned DuplicationByteCount = std::accumulate(
|
||||
std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), 0,
|
||||
[](int value, BinaryBasicBlock *p) {
|
||||
return value + p->getOutputSize();
|
||||
});
|
||||
if (DuplicationByteCount < opts::TailDuplicationMaximumDuplication)
|
||||
BlocksToDuplicate.clear();
|
||||
return BlocksToDuplicate;
|
||||
}
|
||||
|
||||
/// Accumulate tail-duplication statistics for every block of \p Function.
/// Only the pass counters are updated.
void TailDuplication::runOnFunction(BinaryFunction &Function) {
  for (BinaryBasicBlock *Block : Function.layout()) {
    const auto NumSuccs = Block->succ_size();
    const auto NextIndex = Block->getLayoutIndex() + 1;
    const auto ExecCount = Block->getExecutionCount();

    // Tally how often an unconditional jump is executed: either the sole
    // successor or the fallthrough successor is not next in the layout.
    if (NumSuccs == 1) {
      if (Block->getSuccessor()->getLayoutIndex() != NextIndex)
        UnconditionalBranchDynamicCount += ExecCount;
    } else if (NumSuccs == 2) {
      if (Block->getFallthrough()->getLayoutIndex() != NextIndex)
        UnconditionalBranchDynamicCount +=
            Block->getFallthroughBranchInfo().Count;
    }
    AllBlocksDynamicCount += ExecCount;

    // A candidate must be hot and have exactly one successor ...
    if (ExecCount == 0 || NumSuccs != 1)
      continue;

    // ... and that successor must be far enough away.
    BinaryBasicBlock *Target = Block->getSuccessor();
    if (isInCacheLine(*Block, *Target))
      continue;

    const std::vector<BinaryBasicBlock *> Candidates =
        opts::TailDuplicationAggressive ? aggressiveCodeToDuplicate(*Target)
                                        : moderateCodeToDuplicate(*Target);
    if (Candidates.empty())
      continue;

    ++PossibleDuplications;
    PossibleDuplicationsDynamicCount += ExecCount;
  }
}
|
||||
|
||||
/// Run the analysis over every function in \p BC and print the collected
/// statistics as BOLT-INFO lines.
void TailDuplication::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    runOnFunction(Function);
  }

  // Percentage of Total represented by Part. A zero Total (e.g. no profile
  // data) yields 0 instead of dividing by zero and printing "nan%"/"inf%".
  auto PercentOf = [](uint64_t Part, uint64_t Total) -> float {
    if (Total == 0)
      return 0.0f;
    return (static_cast<float>(Part) * 100.0f) / Total;
  };

  outs() << "BOLT-INFO: tail duplication possible duplications: "
         << PossibleDuplications << "\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions: "
         << PossibleDuplicationsDynamicCount << "\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to "
            "unconditional branch execution : "
         << format("%.1f", PercentOf(PossibleDuplicationsDynamicCount,
                                     UnconditionalBranchDynamicCount))
         << "%\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to all "
            "blocks execution : "
         << format("%.1f", PercentOf(PossibleDuplicationsDynamicCount,
                                     AllBlocksDynamicCount))
         << "%\n";
}
|
||||
|
||||
} // end namespace bolt
|
||||
} // end namespace llvm
|
|
@ -0,0 +1,84 @@
|
|||
//===--------- Passes/TailDuplication.h ---------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_TAILDUPLICATION_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_PASSES_TAILDUPLICATION_H
|
||||
|
||||
#include "BinaryPasses.h"
|
||||
|
||||
// This pass finds cases where BBs have the layout:
|
||||
// #BB0:
|
||||
// <body>
|
||||
// jmp #BB2
|
||||
// ....
|
||||
// #BB1
|
||||
// <body>
|
||||
// #BB2:
|
||||
// <body>
|
||||
//
|
||||
// And duplicates #BB2 and puts it after #BB0:
|
||||
// #BB0:
|
||||
// <body>
|
||||
// #BB2:
|
||||
// <body>
|
||||
// ....
|
||||
// #BB1
|
||||
// <body>
|
||||
// #BB2:
|
||||
// <body>
|
||||
//
|
||||
// The advantage is getting rid of an unconditional branch and hopefully to
|
||||
// improve i-cache performance by reducing fragmentation. The disadvantage is
|
||||
// that if there is too much code duplication, we may end up evicting hot cache
|
||||
// lines and causing the opposite effect, hurting i-cache performance. This needs
|
||||
// to be well balanced to achieve the optimal effect.
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
/// Pass for duplicating blocks that would require a jump.
|
||||
class TailDuplication : public BinaryFunctionPass {
|
||||
/// Record how many possible tail duplications there can be.
|
||||
uint64_t PossibleDuplications = 0;
|
||||
|
||||
/// Record how many times these duplications would get used.
|
||||
uint64_t PossibleDuplicationsDynamicCount = 0;
|
||||
|
||||
/// Record the execution count of all unconditional branches
|
||||
uint64_t UnconditionalBranchDynamicCount = 0;
|
||||
|
||||
/// Record the execution count of all blocks
|
||||
uint64_t AllBlocksDynamicCount = 0;
|
||||
|
||||
/// True if Succ is in the same cache line as BB (approximately)
|
||||
bool isInCacheLine(const BinaryBasicBlock &BB,
|
||||
const BinaryBasicBlock &Succ) const;
|
||||
|
||||
/// Returns a vector of BinaryBasicBlock to copy after BB. If it's empty,
|
||||
/// nothing should be duplicated
|
||||
std::vector<BinaryBasicBlock *>
|
||||
moderateCodeToDuplicate(BinaryBasicBlock &BB) const;
|
||||
std::vector<BinaryBasicBlock *>
|
||||
aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const;
|
||||
|
||||
void runOnFunction(BinaryFunction &Function);
|
||||
|
||||
public:
|
||||
explicit TailDuplication() : BinaryFunctionPass(false) {}
|
||||
|
||||
const char *getName() const override { return "tail duplication"; }
|
||||
|
||||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
|
@ -0,0 +1,25 @@
|
|||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: %host_cc %cflags %t.o -o %t.exe -Wl,-q
|
||||
# RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=cache+ -print-finalized \
|
||||
# RUN: -tail-duplication -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s
|
||||
|
||||
# FDATA: 1 main 2 1 main #.BB2# 0 10
|
||||
# FDATA: 1 main 4 1 main #.BB2# 0 20
|
||||
# CHECK: tail duplication possible duplications: 1
|
||||
|
||||
.text
|
||||
.globl main
|
||||
.type main, %function
|
||||
.size main, .Lend-main
|
||||
main:
|
||||
xor %eax, %eax
|
||||
jmp .BB2
|
||||
.BB1:
|
||||
inc %rax
|
||||
.BB2:
|
||||
retq
|
||||
.Lend:
|
Loading…
Reference in New Issue