[BOLT] Add an option to specialize memcpy() for 1 byte copy

Summary:
Add an option:

  -memcpy1-spec=func1,func2:cs1,func3:cs1:cs2,...

to specialize calls to memcpy() in the listed functions (a name may be
given as a regular expression) for size 1. The optimization dynamically
checks whether the size argument equals 1 and, if so, executes a one-byte
copy; otherwise it calls memcpy() as usual. Specific call sites can be
selected after ":" using their ordinal number counted from the start of
the function (the first call site is 1).

(cherry picked from FBD15428936)
This commit is contained in:
Maksim Panchenko 2019-05-20 20:11:40 -07:00
parent ca659e4336
commit d047df12c5
8 changed files with 268 additions and 7 deletions

View File

@ -456,6 +456,13 @@ public:
}
}
/// Append every element of the range \p R, in iteration order, to the end of
/// this basic block by forwarding each one to addInstruction().
template <typename RangeTy>
void addInstructions(RangeTy R) {
  for (auto &Instr : R) {
    addInstruction(Instr);
  }
}
/// Add instruction before Pos in this basic block.
template <typename Itr>
Itr insertPseudoInstr(Itr Pos, MCInst &Instr) {

View File

@ -683,6 +683,10 @@ public:
return iterator_range<const_iterator>(begin(), end());
}
// Iterators over the underlying BasicBlocks container itself (dereferencing
// yields a basic block pointer).
BasicBlockListType::iterator pbegin() {
  return BasicBlocks.begin();
}
BasicBlockListType::iterator pend() {
  return BasicBlocks.end();
}
/// Iterator to the first element of BasicBlocksLayout (mutable access).
order_iterator layout_begin() {
  return BasicBlocksLayout.begin();
}
/// Iterator to the first element of BasicBlocksLayout (read-only access).
const_order_iterator layout_begin() const {
  return BasicBlocksLayout.begin();
}

View File

@ -228,6 +228,14 @@ StringOps("inline-memcpy",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
// -memcpy1-spec: functions (optionally followed by ":"-separated call site
// numbers) in which calls to memcpy() are specialized for a size of 1.
//
// The documented syntax is a comma-separated list, so the option must be
// declared with cl::CommaSeparated; without it "func1,func2:cs1" would be
// stored as a single entry and never match a function. Passing the option
// several times keeps working as before.
// NOTE(review): a function-name regex that itself contains ',' will now be
// split; such patterns have to be expressed without a comma.
static cl::list<std::string>
SpecializeMemcpy1("memcpy1-spec",
  cl::desc("list of functions with call sites for which to specialize memcpy() "
           "for size 1"),
  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));
static cl::opt<bool>
StripRepRet("strip-rep-ret",
cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
@ -370,7 +378,12 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
opts::ICF);
if (BC.isAArch64())
Manager.registerPass(llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
Manager.registerPass(
llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
Manager.registerPass(
llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
!opts::SpecializeMemcpy1.empty());
Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
opts::StringOps);

View File

@ -368,7 +368,18 @@ public:
llvm_unreachable("not implemented");
}
virtual MCPhysReg getX86NoRegister() const {
/// Return a register number that is guaranteed not to match any real
/// register of the underlying architecture.
///
/// The default implementation is a stub that reports "not implemented".
virtual MCPhysReg getNoRegister() const {
  llvm_unreachable("not implemented");
}
/// Return the register that carries integer function argument number
/// \p ArgNo (numbering starts at 0) when that argument is passed in a
/// register; otherwise return the result of getNoRegister().
///
/// Note: the answer should depend on the calling convention in use.
///
/// The default implementation is a stub that reports "not implemented".
virtual MCPhysReg getIntArgRegister(unsigned ArgNo) const {
  llvm_unreachable("not implemented");
}
@ -1359,6 +1370,21 @@ public:
return true;
}
/// Create an instruction sequence that is an inline version of
/// memcpy(dest, src, 1).
///
/// The default implementation is a stub that reports "not implemented"; the
/// trailing return only gives the function a value on every path.
virtual std::vector<MCInst> createOneByteMemcpy() const {
  llvm_unreachable("not implemented");
  return {};
}
/// Create an instruction sequence that compares the contents of register
/// \p RegNo with the immediate \p Imm and jumps to \p Target when they are
/// equal. \p Ctx is the MCContext made available for building the sequence.
///
/// The default implementation is a stub that reports "not implemented"; the
/// trailing return only gives the function a value on every path.
virtual std::vector<MCInst>
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
            MCContext *Ctx) const {
  llvm_unreachable("not implemented");
  return {};
}
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
/// (dest + n) instead of dest.
virtual std::vector<MCInst> createInlineMemcpy(bool ReturnEnd) const {

View File

@ -1553,5 +1553,143 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
}
}
/// Decide whether \p Function is a candidate for this pass.
///
/// A function qualifies when the generic BinaryFunctionPass check accepts it
/// and at least one spec entry names it. The name part of an entry is the text
/// before the first ':'; it is tried first as a plain name and then as a
/// regular expression.
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (auto &Entry : Spec) {
    auto Name = StringRef(Entry).split(':').first;
    if (Function.hasName(Name) || Function.hasNameRegex(Name))
      return true;
  }

  return false;
}
std::set<size_t>
SpecializeMemcpy1::getCallSitesToOptimize(const BinaryFunction &Function) const{
StringRef SitesString;
for (auto &FunctionSpec : Spec) {
StringRef FunctionName;
std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
if (Function.hasName(FunctionName))
break;
if (Function.hasNameRegex(FunctionName))
break;
SitesString = "";
}
std::set<size_t> Sites;
SmallVector<StringRef, 4> SitesVec;
SitesString.split(SitesVec, ':');
for (auto SiteString : SitesVec) {
if (SiteString.empty())
continue;
size_t Result;
if (!SiteString.getAsInteger(10, Result))
Sites.emplace(Result);
}
return Sites;
}
/// Specialize selected memcpy() call sites for a copy size of 1.
///
/// For every function accepted by shouldOptimize(), each direct, non-tail
/// call to "memcpy" / "memcpy@PLT" is numbered (starting at 1). A selected
/// call is replaced by a compare of the size argument with 1 followed by a
/// conditional jump: one block performs the inline one-byte copy, another
/// block holds a copy of the original call, and both continue at a common
/// next block. Does nothing on non-x86 targets. Prints a summary to outs()
/// when at least one call site was specialized.
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
return;
// Number of rewritten call sites, and the sum of the known execution counts
// of the blocks that contained them.
uint64_t NumSpecialized = 0;
uint64_t NumSpecializedDyno = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
auto &Function = BFI.second;
// This is the member function; the lambda of the same name declared below
// shadows it for the rest of the loop body.
if (!shouldOptimize(Function))
continue;
auto CallsToOptimize = getCallSitesToOptimize(Function);
// An empty set means that every call site in the function is selected.
auto shouldOptimize = [&](size_t N) {
return CallsToOptimize.empty() || CallsToOptimize.count(N);
};
// Iterate over a snapshot of the block pointers: blocks are added to the
// function while rewriting, and the outer loop must not walk the container
// that is being modified.
std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
size_t CallSiteID = 0;
for (auto *CurBB : Blocks) {
for(auto II = CurBB->begin(); II != CurBB->end(); ++II) {
auto &Inst = *II;
// Only calls with a single primary operand that is an expression are
// considered.
if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
!Inst.getOperand(0).isExpr())
continue;
// NOTE(review): CalleeSymbol is dereferenced without a null check; this
// assumes getTargetSymbol() always yields a symbol for such calls —
// TODO confirm.
const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
if (CalleeSymbol->getName() != "memcpy" &&
CalleeSymbol->getName() != "memcpy@PLT")
continue;
if (BC.MIB->isTailCall(Inst))
continue;
// Call sites are numbered from 1; calls that are not selected still
// consume a number.
++CallSiteID;
if (!shouldOptimize(CallSiteID))
continue;
// Create a copy of a call to memcpy(dest, src, size).
auto MemcpyInstr = Inst;
// Carve the call out of CurBB. In both branches below the first
// instruction of the split-off block is erased; presumably that is the
// original call, which splitAt() is assumed to move into the new block —
// TODO confirm against BinaryBasicBlock::splitAt().
auto *OneByteMemcpyBB = CurBB->splitAt(II);
BinaryBasicBlock *NextBB{nullptr};
if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
// More than one non-pseudo instruction: split again so that the remaining
// instructions end up in a separate continuation block.
NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
NextBB->eraseInstruction(NextBB->begin());
} else {
// Otherwise continue at the block's existing successor.
NextBB = OneByteMemcpyBB->getSuccessor();
OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
assert(NextBB && "unexpected call to memcpy() with no return");
}
// New block that will hold the copy of the original memcpy() call.
auto *MemcpyBB = Function.addBasicBlock(0);
// Integer argument 2 (numbering starts at 0) is the size parameter of
// memcpy(dest, src, size): compare it with 1 and jump to the one-byte
// copy block when equal.
auto CmpJCC = BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2),
1,
OneByteMemcpyBB->getLabel(),
BC.Ctx.get());
CurBB->addInstructions(CmpJCC);
// MemcpyBB is registered as an additional successor of CurBB; presumably it
// is reached when the jump is not taken — TODO confirm.
CurBB->addSuccessor(MemcpyBB);
MemcpyBB->addInstruction(std::move(MemcpyInstr));
MemcpyBB->addSuccessor(NextBB);
MemcpyBB->setCFIState(NextBB->getCFIState());
MemcpyBB->setExecutionCount(0);
// To prevent the actual call from being moved to cold, we set its
// execution count to 1.
if (CurBB->getKnownExecutionCount() > 0)
MemcpyBB->setExecutionCount(1);
// Fill the (now call-free) block with the inline one-byte copy.
auto OneByteMemcpy = BC.MIB->createOneByteMemcpy();
OneByteMemcpyBB->addInstructions(OneByteMemcpy);
++NumSpecialized;
NumSpecializedDyno += CurBB->getKnownExecutionCount();
// Resume scanning in the continuation block; the previous iterator belongs
// to a block whose contents were just changed.
CurBB = NextBB;
// Note: we don't expect the next instruction to be a call to memcpy.
// The loop's ++II runs right after this assignment, so the first
// instruction of NextBB is never examined.
// NOTE(review): if NextBB could be empty, that increment would step past
// end() — TODO confirm this cannot happen.
// NOTE(review): when NextBB is a pre-existing successor that is also in
// Blocks, the outer loop may scan it again later, which would count its
// remaining memcpy() calls twice in CallSiteID — TODO confirm.
II = CurBB->begin();
}
}
}
// Report how many call sites were rewritten and, if known from the profile,
// how often their blocks executed.
if (NumSpecialized) {
outs() << "BOLT-INFO: specialized " << NumSpecialized
<< " memcpy() call sites for size 1";
if (NumSpecializedDyno)
outs() << ". The calls were executed " << NumSpecializedDyno
<< " times based on profile.";
outs() << '\n';
}
}
} // namespace bolt
} // namespace llvm

View File

@ -39,7 +39,7 @@ protected:
/// Control whether a specific function should be skipped during
/// optimization.
bool shouldOptimize(const BinaryFunction &BF) const;
virtual bool shouldOptimize(const BinaryFunction &BF) const;
public:
virtual ~BinaryFunctionPass() = default;
@ -403,6 +403,29 @@ public:
void runOnFunctions(BinaryContext &BC) override;
};
/// Pass that specializes calls to memcpy() for a copy size of 1 byte.
class SpecializeMemcpy1 : public BinaryFunctionPass {
public:
  /// \p PrintPass is handed to the BinaryFunctionPass base. The contents of
  /// \p Spec are copied into the pass.
  explicit SpecializeMemcpy1(const cl::opt<bool> &PrintPass,
                             cl::list<std::string> &Spec)
      : BinaryFunctionPass(PrintPass), Spec(Spec) {}

  const char *getName() const override { return "specialize-memcpy"; }

  bool shouldOptimize(const BinaryFunction &BF) const override;

  void runOnFunctions(BinaryContext &BC) override;

private:
  /// Return the numbers of the call sites to optimize; numbering starts
  /// at 1. An empty set stands for all call sites in the function.
  std::set<size_t> getCallSitesToOptimize(const BinaryFunction &) const;

  /// Spec entries: a function name, optionally followed by ':'-separated
  /// call site numbers.
  std::vector<std::string> Spec;
};
enum FrameOptimizationType : char {
FOP_NONE, /// Don't perform FOP.
FOP_HOT, /// Perform FOP on hot functions.

View File

@ -196,19 +196,19 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
Ostream.flush();
}
Tag += MemRef.BaseRegNum != BC.MIB->getX86NoRegister()
Tag += MemRef.BaseRegNum != BC.MIB->getNoRegister()
? "r" + to_string(MemRef.BaseRegNum)
: "";
Tag +=
MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue);
Tag += MemRef.IndexRegNum != BC.MIB->getX86NoRegister()
Tag += MemRef.IndexRegNum != BC.MIB->getNoRegister()
? "+" + to_string(MemRef.ScaleValue) + "*" +
to_string(MemRef.IndexRegNum)
: "";
Tag += MemRef.SegRegNum != BC.MIB->getX86NoRegister()
Tag += MemRef.SegRegNum != BC.MIB->getNoRegister()
? "_seg_" + to_string(MemRef.SegRegNum)
: "";

View File

@ -2759,6 +2759,43 @@ public:
return Code;
}
std::vector<MCInst> createOneByteMemcpy() const override {
std::vector<MCInst> Code;
Code.emplace_back(MCInstBuilder(X86::MOV8rm)
.addReg(X86::CL)
.addReg(X86::RSI)
.addImm(0)
.addReg(X86::NoRegister)
.addImm(0)
.addReg(X86::NoRegister));
Code.emplace_back(MCInstBuilder(X86::MOV8mr)
.addReg(X86::RDI)
.addImm(0)
.addReg(X86::NoRegister)
.addImm(0)
.addReg(X86::NoRegister)
.addReg(X86::CL));
Code.emplace_back(MCInstBuilder(X86::MOV64rr)
.addReg(X86::RAX)
.addReg(X86::RDI));
return Code;
}
/// Create "cmp $Imm, %RegNo; je Target": compare the 64-bit register
/// \p RegNo with the immediate \p Imm and jump to \p Target when equal.
///
/// The compare opcode is chosen by the size of \p Imm: CMP64ri8 carries a
/// sign-extended 8-bit immediate and is used for values in [-128, 127];
/// any other value uses CMP64ri32, which carries a sign-extended 32-bit
/// immediate. \p Imm must fit into a signed 32-bit integer.
///
/// \p Ctx is the context in which the symbol reference expression for
/// \p Target is created.
std::vector<MCInst>
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
            MCContext *Ctx) const override {
  assert(Imm >= -2147483648LL && Imm <= 2147483647LL &&
         "immediate does not fit into a 64-bit compare instruction");

  // Using the 8-bit form for a wider value would encode the wrong constant.
  const bool FitsInt8 = Imm >= -128 && Imm <= 127;
  const unsigned CmpOpcode = FitsInt8 ? X86::CMP64ri8 : X86::CMP64ri32;

  std::vector<MCInst> Code;
  Code.emplace_back(MCInstBuilder(CmpOpcode)
                        .addReg(RegNo)
                        .addImm(Imm));
  Code.emplace_back(MCInstBuilder(X86::JE_1)
                        .addExpr(MCSymbolRefExpr::create(
                            Target,
                            MCSymbolRefExpr::VK_None,
                            *Ctx)));
  return Code;
}
bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol, int64_t Addend,
MCContext *Ctx, int64_t &Value,
uint64_t RelType) const override {
@ -2903,10 +2940,23 @@ public:
return X86::R11;
}
MCPhysReg getX86NoRegister() const override {
/// Return X86::NoRegister, the value used for "no register".
MCPhysReg getNoRegister() const override { return X86::NoRegister; }
/// Return the register used for integer argument number \p ArgNo (numbering
/// starts at 0): RDI, RSI, RDX, RCX, R8, R9 for arguments 0 through 5, and
/// getNoRegister() for any larger index.
MCPhysReg getIntArgRegister(unsigned ArgNo) const override {
  // FIXME: this should depend on the calling convention.
  static const MCPhysReg ArgRegs[] = {X86::RDI, X86::RSI, X86::RDX,
                                      X86::RCX, X86::R8,  X86::R9};
  const unsigned NumArgRegs = sizeof(ArgRegs) / sizeof(ArgRegs[0]);
  if (ArgNo < NumArgRegs)
    return ArgRegs[ArgNo];
  return getNoRegister();
}
void createPause(MCInst &Inst) const override {
Inst.clear();
Inst.setOpcode(X86::PAUSE);