forked from OSchip/llvm-project
[BOLT] Add an option to specialize memcpy() for 1 byte copy
Summary: Add an option: -memcpy1-spec=func1,func2:cs1,func3:cs1:cs2,... to specialize calls to memcpy() in the listed functions (the name may be supplied as a regex) for size 1. The optimization dynamically checks whether the size argument equals 1 and, if so, executes a one-byte copy; otherwise it calls memcpy() as usual. Specific call sites may be indicated after ":" using their numeric count from the start of the function. (cherry picked from FBD15428936)
This commit is contained in:
parent
ca659e4336
commit
d047df12c5
|
@ -456,6 +456,13 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
/// Append every instruction from the range \p R to the end of this
/// basic block, in order.
template <typename RangeTy>
void addInstructions(RangeTy R) {
  for (auto &Inst : R) {
    addInstruction(Inst);
  }
}
|
||||
|
||||
/// Add instruction before Pos in this basic block.
|
||||
template <typename Itr>
|
||||
Itr insertPseudoInstr(Itr Pos, MCInst &Instr) {
|
||||
|
|
|
@ -683,6 +683,10 @@ public:
|
|||
return iterator_range<const_iterator>(begin(), end());
|
||||
}
|
||||
|
||||
// Iterators by pointer.
|
||||
BasicBlockListType::iterator pbegin() { return BasicBlocks.begin(); }
|
||||
BasicBlockListType::iterator pend() { return BasicBlocks.end(); }
|
||||
|
||||
order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
|
||||
const_order_iterator layout_begin() const
|
||||
{ return BasicBlocksLayout.begin(); }
|
||||
|
|
|
@ -228,6 +228,14 @@ StringOps("inline-memcpy",
|
|||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
// -memcpy1-spec: comma-separated "function[:site[:site...]]" entries naming
// the functions (and optionally 1-based call-site indices) whose memcpy()
// calls should receive the dynamic one-byte specialization.
static cl::list<std::string>
SpecializeMemcpy1("memcpy1-spec",
  cl::desc("list of functions with call sites for which to specialize memcpy() "
           "for size 1"),
  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
StripRepRet("strip-rep-ret",
|
||||
cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
|
||||
|
@ -370,7 +378,12 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
|
|||
opts::ICF);
|
||||
|
||||
if (BC.isAArch64())
|
||||
Manager.registerPass(llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
|
||||
|
||||
Manager.registerPass(
|
||||
llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
|
||||
!opts::SpecializeMemcpy1.empty());
|
||||
|
||||
Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
|
||||
opts::StringOps);
|
||||
|
|
|
@ -368,7 +368,18 @@ public:
|
|||
llvm_unreachable("not implemented");
|
||||
}
|
||||
|
||||
virtual MCPhysReg getX86NoRegister() const {
|
||||
/// Return a register number that is guaranteed to not match with
/// any real register on the underlying architecture.
/// Base implementation aborts; target-specific subclasses must override.
virtual MCPhysReg getNoRegister() const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
/// Return a register corresponding to a function integer argument \p ArgNo
/// if the argument is passed in a register. Or return the result of
/// getNoRegister() otherwise. The enumeration starts at 0.
///
/// Note: this should depend on a used calling convention.
/// Base implementation aborts; target-specific subclasses must override.
virtual MCPhysReg getIntArgRegister(unsigned ArgNo) const {
  llvm_unreachable("not implemented");
}
|
||||
|
||||
|
@ -1359,6 +1370,21 @@ public:
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Create an inline version of memcpy(dest, src, 1).
/// Base implementation aborts; target-specific subclasses must override.
virtual std::vector<MCInst> createOneByteMemcpy() const {
  llvm_unreachable("not implemented");
  return {};
}
|
||||
|
||||
/// Create a sequence of instructions to compare contents of a register
/// \p RegNo to immediate \p Imm and jump to \p Target if they are equal.
/// Base implementation aborts; target-specific subclasses must override.
virtual std::vector<MCInst>
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
            MCContext *Ctx) const {
  llvm_unreachable("not implemented");
  return {};
}
|
||||
|
||||
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
|
||||
/// (dest + n) instead of dest.
|
||||
virtual std::vector<MCInst> createInlineMemcpy(bool ReturnEnd) const {
|
||||
|
|
|
@ -1553,5 +1553,143 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
|
|||
}
|
||||
}
|
||||
|
||||
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
|
||||
if (!BinaryFunctionPass::shouldOptimize(Function))
|
||||
return false;
|
||||
|
||||
for (auto &FunctionSpec : Spec) {
|
||||
auto FunctionName = StringRef(FunctionSpec).split(':').first;
|
||||
if (Function.hasName(FunctionName))
|
||||
return true;
|
||||
if (Function.hasNameRegex(FunctionName))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Parse the user-supplied spec list and return the 1-based indices of the
/// memcpy() call sites to specialize in \p Function. An empty set means
/// "specialize every call site".
std::set<size_t>
SpecializeMemcpy1::getCallSitesToOptimize(const BinaryFunction &Function) const{
  StringRef SitesString;
  // Find the first spec entry naming this function; keep the ':'-separated
  // site list that follows the function name.
  for (auto &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasName(FunctionName))
      break;
    if (Function.hasNameRegex(FunctionName))
      break;
    // No match: clear the leftover split so a failed scan yields "".
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (auto SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    // getAsInteger() returns true on parse failure; malformed entries are
    // silently ignored.
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}
|
||||
|
||||
/// Rewrite selected memcpy() call sites so that a size of 1 takes an inlined
/// one-byte copy and every other size falls back to the real call:
///
///   <CurBB>:           cmp  <size-arg-reg>, 1 ; je OneByteMemcpyBB
///   <MemcpyBB>:        call memcpy            ; (original call, size != 1)
///   <OneByteMemcpyBB>: inlined one-byte copy
///   <NextBB>:          fall-through continuation
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // The specialized sequence is built from x86 instructions only.
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;     // static count of rewritten call sites
  uint64_t NumSpecializedDyno = 0; // profile-weighted execution count
  for (auto &BFI : BC.getBinaryFunctions()) {
    auto &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // An empty set means "every memcpy() call site in this function".
    auto CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Snapshot the block list: splitting blocks below mutates the
    // function's block storage while we iterate.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (auto *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        auto &Inst = *II;

        // Only direct calls with a single symbolic target operand qualify.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        // The expression operand may not resolve to a symbol; guard against
        // a null dereference below.
        if (!CalleeSymbol)
          continue;
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        // A tail call has no fall-through continuation to resume at.
        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        auto MemcpyInstr = Inst;

        // Split before the call; the new block starts with the call itself.
        auto *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB{nullptr};
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          // Instructions follow the call: split them into NextBB, then drop
          // the call carried over as NextBB's first instruction.
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          // The call was the last instruction; continuation is the successor.
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // Block hosting the original call for sizes other than 1.
        auto *MemcpyBB = Function.addBasicBlock(0);
        // Compare the size argument (third integer argument register) to 1
        // and branch to the inlined copy when equal.
        auto CmpJCC = BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2),
                                          1,
                                          OneByteMemcpyBB->getLabel(),
                                          BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        auto OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning in the fall-through block.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
|
|
@ -39,7 +39,7 @@ protected:
|
|||
|
||||
/// Control whether a specific function should be skipped during
|
||||
/// optimization.
|
||||
bool shouldOptimize(const BinaryFunction &BF) const;
|
||||
virtual bool shouldOptimize(const BinaryFunction &BF) const;
|
||||
public:
|
||||
virtual ~BinaryFunctionPass() = default;
|
||||
|
||||
|
@ -403,6 +403,29 @@ public:
|
|||
void runOnFunctions(BinaryContext &BC) override;
|
||||
};
|
||||
|
||||
/// Pass for specializing memcpy for a size of 1 byte.
class SpecializeMemcpy1 : public BinaryFunctionPass {
private:
  // Raw "function[:site[:site...]]" specifiers from the -memcpy1-spec option.
  std::vector<std::string> Spec;

  /// Return indices of the call sites to optimize. Count starts at 1.
  /// Returns an empty set for all call sites in the function.
  std::set<size_t> getCallSitesToOptimize(const BinaryFunction &) const;

public:
  explicit SpecializeMemcpy1(const cl::opt<bool> &PrintPass,
                             cl::list<std::string> &Spec)
    : BinaryFunctionPass(PrintPass), Spec(Spec) {}

  /// True when \p BF passes the generic filter and matches a Spec entry.
  bool shouldOptimize(const BinaryFunction &BF) const override;

  const char *getName() const override {
    return "specialize-memcpy";
  }

  void runOnFunctions(BinaryContext &BC) override;
};
|
||||
|
||||
enum FrameOptimizationType : char {
|
||||
FOP_NONE, /// Don't perform FOP.
|
||||
FOP_HOT, /// Perform FOP on hot functions.
|
||||
|
|
|
@ -196,19 +196,19 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
|
|||
Ostream.flush();
|
||||
}
|
||||
|
||||
Tag += MemRef.BaseRegNum != BC.MIB->getX86NoRegister()
|
||||
Tag += MemRef.BaseRegNum != BC.MIB->getNoRegister()
|
||||
? "r" + to_string(MemRef.BaseRegNum)
|
||||
: "";
|
||||
|
||||
Tag +=
|
||||
MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue);
|
||||
|
||||
Tag += MemRef.IndexRegNum != BC.MIB->getX86NoRegister()
|
||||
Tag += MemRef.IndexRegNum != BC.MIB->getNoRegister()
|
||||
? "+" + to_string(MemRef.ScaleValue) + "*" +
|
||||
to_string(MemRef.IndexRegNum)
|
||||
: "";
|
||||
|
||||
Tag += MemRef.SegRegNum != BC.MIB->getX86NoRegister()
|
||||
Tag += MemRef.SegRegNum != BC.MIB->getNoRegister()
|
||||
? "_seg_" + to_string(MemRef.SegRegNum)
|
||||
: "";
|
||||
|
||||
|
|
|
@ -2759,6 +2759,43 @@ public:
|
|||
return Code;
|
||||
}
|
||||
|
||||
std::vector<MCInst> createOneByteMemcpy() const override {
|
||||
std::vector<MCInst> Code;
|
||||
Code.emplace_back(MCInstBuilder(X86::MOV8rm)
|
||||
.addReg(X86::CL)
|
||||
.addReg(X86::RSI)
|
||||
.addImm(0)
|
||||
.addReg(X86::NoRegister)
|
||||
.addImm(0)
|
||||
.addReg(X86::NoRegister));
|
||||
Code.emplace_back(MCInstBuilder(X86::MOV8mr)
|
||||
.addReg(X86::RDI)
|
||||
.addImm(0)
|
||||
.addReg(X86::NoRegister)
|
||||
.addImm(0)
|
||||
.addReg(X86::NoRegister)
|
||||
.addReg(X86::CL));
|
||||
Code.emplace_back(MCInstBuilder(X86::MOV64rr)
|
||||
.addReg(X86::RAX)
|
||||
.addReg(X86::RDI));
|
||||
return Code;
|
||||
}
|
||||
|
||||
std::vector<MCInst>
|
||||
createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target,
|
||||
MCContext *Ctx) const override {
|
||||
std::vector<MCInst> Code;
|
||||
Code.emplace_back(MCInstBuilder(X86::CMP64ri8)
|
||||
.addReg(RegNo)
|
||||
.addImm(Imm));
|
||||
Code.emplace_back(MCInstBuilder(X86::JE_1)
|
||||
.addExpr(MCSymbolRefExpr::create(
|
||||
Target,
|
||||
MCSymbolRefExpr::VK_None,
|
||||
*Ctx)));
|
||||
return Code;
|
||||
}
|
||||
|
||||
bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol, int64_t Addend,
|
||||
MCContext *Ctx, int64_t &Value,
|
||||
uint64_t RelType) const override {
|
||||
|
@ -2903,10 +2940,23 @@ public:
|
|||
return X86::R11;
|
||||
}
|
||||
|
||||
MCPhysReg getX86NoRegister() const override {
|
||||
/// X86: the sentinel register number that matches no real register.
MCPhysReg getNoRegister() const override {
  return X86::NoRegister;
}
|
||||
|
||||
/// Return the register carrying integer argument \p ArgNo (0-based), or
/// getNoRegister() when the argument is not passed in a register.
MCPhysReg getIntArgRegister(unsigned ArgNo) const override {
  // FIXME: this should depend on the calling convention.
  static const MCPhysReg ArgRegs[] = {X86::RDI, X86::RSI, X86::RDX,
                                      X86::RCX, X86::R8,  X86::R9};
  if (ArgNo < sizeof(ArgRegs) / sizeof(ArgRegs[0]))
    return ArgRegs[ArgNo];
  return getNoRegister();
}
|
||||
|
||||
void createPause(MCInst &Inst) const override {
|
||||
Inst.clear();
|
||||
Inst.setOpcode(X86::PAUSE);
|
||||
|
|
Loading…
Reference in New Issue