[BOLT][PR] Instrumentation: Introduce -no-counters-clear and -wait-forks options

Summary:
This PR introduces 2 new instrumentation options:
1. instrumentation-no-counters-clear: Discussed at https://github.com/facebookincubator/BOLT/issues/121
2. instrumentation-wait-forks: Since the instrumentation counters are mapped as MAP_SHARED, it is useful to be able to wait until all forks of the parent process have died, which is done by tracking the process group.
The last patch is just a refactoring of the emitBinary code.
Vladislav Khmelevsky,
Advanced Software Technology Lab, Huawei

Pull Request resolved: https://github.com/facebookincubator/BOLT/pull/125
GitHub Author: Vladislav Khmelevskyi <Vladislav.Khmelevskyi@huawei.com>

(cherry picked from FBD26919011)
This commit is contained in:
Vladislav Khmelevsky 2021-03-09 16:18:11 -08:00 committed by Maksim Panchenko
parent 225a8d7f2c
commit 76d346ca14
4 changed files with 144 additions and 73 deletions

View File

@ -1,3 +1,13 @@
//===-- common.h ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#if !defined(__APPLE__)
#include <cstddef>
@ -333,6 +343,36 @@ uint64_t __getppid() {
return ret;
}
/// Thin wrapper over the Linux x86-64 setpgid system call (number 109).
///
/// \p pid is passed in %rdi and \p pgid in %rsi. The low 32 bits of whatever
/// the kernel leaves in %rax are returned unchanged; under the Linux syscall
/// convention a negative result is a negated errno code.
int __setpgid(uint64_t pid, uint64_t pgid) {
  int Status;
  // The syscall instruction itself overwrites %rcx and %r11, hence the
  // clobber list.
  __asm__ __volatile__("movq $109, %%rax\n"
                       "syscall\n"
                       : "=a"(Status)
                       : "D"(pid), "S"(pgid)
                       : "cc", "rcx", "r11", "memory");
  return Status;
}
/// Thin wrapper over the Linux x86-64 getpgid system call (number 121).
///
/// \p pid is passed in %rdi. The full 64-bit value the kernel leaves in %rax
/// is returned unchanged, so a failure shows up as a negated errno code
/// reinterpreted as an unsigned value.
uint64_t __getpgid(uint64_t pid) {
  uint64_t GroupId;
  // The syscall instruction itself overwrites %rcx and %r11, hence the
  // clobber list.
  __asm__ __volatile__("movq $121, %%rax\n"
                       "syscall\n"
                       : "=a"(GroupId)
                       : "D"(pid)
                       : "cc", "rcx", "r11", "memory");
  return GroupId;
}
/// Thin wrapper over the Linux x86-64 kill system call (number 62).
///
/// \p pid is passed in %rdi and \p sig in %rsi. The low 32 bits of whatever
/// the kernel leaves in %rax are returned unchanged; under the Linux syscall
/// convention a negative result is a negated errno code.
int __kill(uint64_t pid, int sig) {
  int Status;
  // The syscall instruction itself overwrites %rcx and %r11, hence the
  // clobber list.
  __asm__ __volatile__("movq $62, %%rax\n"
                       "syscall\n"
                       : "=a"(Status)
                       : "D"(pid), "S"(sig)
                       : "cc", "rcx", "r11", "memory");
  return Status;
}
#endif
void reportError(const char *Msg, uint64_t Size) {

View File

@ -84,6 +84,10 @@ extern uint32_t __bolt_instr_num_ind_targets;
extern uint32_t __bolt_instr_num_funcs;
// Time to sleep across dumps (when we write the fdata profile to disk)
extern uint32_t __bolt_instr_sleep_time;
// When true, do not clear the counters across dumps; each dump rewrites the
// file with the updated values
extern bool __bolt_instr_no_counters_clear;
// When true, wait until all forks of the instrumented process have finished
extern bool __bolt_instr_wait_forks;
// Filename to dump data to
extern char __bolt_instr_filename[];
// If true, append current PID to the fdata filename when creating it so
@ -1402,23 +1406,43 @@ extern "C" void __bolt_instr_data_dump() {
// Body of the watcher process forked from __bolt_instr_setup. It wakes up
// after each one-second __nanosleep request, dumps the profile every
// __bolt_instr_sleep_time wake-ups, and exits once the target it monitors is
// reported gone.
void watchProcess() {
timespec ts, rem;
uint64_t Ellapsed = 0ull;
// Target later handed to __kill(): the parent's pid, or, when
// __bolt_instr_wait_forks is set, the negated process-group id.
uint64_t ppid;
if (__bolt_instr_wait_forks) {
// Store parent pgid
// (negated; presumably so that __kill() addresses the whole process group,
// as kill(2) does for negative pids -- TODO confirm)
ppid = -__getpgid(0);
// And leave parent process group
__setpgid(0, 0);
} else {
// Store parent pid
ppid = __getppid();
if (ppid == 1) {
// Parent already dead
goto out;
}
}
// Request a one-second sleep per loop iteration.
ts.tv_sec = 1;
ts.tv_nsec = 0;
while (1) {
__nanosleep(&ts, &rem);
// This means our parent process died, so no need for us to keep dumping.
// Notice that make and some systems will wait until all child processes
// of a command finish before proceeding, so it is important to exit as
// early as possible once our parent dies.
// NOTE(review): this listing appears to interleave the pre-change check
// (__getppid() == 1) with the post-change check (__kill(ppid, 0) < 0), and
// the braces do not balance as shown -- confirm which line is live.
if (__getppid() == 1) {
// This means our parent process or all its forks are dead,
// so no need for us to keep dumping.
// Signal 0 is used as a probe; per kill(2) it delivers nothing and only
// reports whether the target can be signalled.
if (__kill(ppid, 0) < 0) {
// When counters are never cleared, write one final dump before leaving.
if (__bolt_instr_no_counters_clear)
__bolt_instr_data_dump();
break;
}
// Only dump once every __bolt_instr_sleep_time wake-ups.
if (++Ellapsed < __bolt_instr_sleep_time)
continue;
Ellapsed = 0;
__bolt_instr_data_dump();
// NOTE(review): the unconditional clear below looks like the pre-change line
// that the conditional clear after it replaces -- confirm.
__bolt_instr_clear_counters();
if (__bolt_instr_no_counters_clear == false)
__bolt_instr_clear_counters();
}
out:;
DEBUG(report("My parent process is dead, bye!\n"));
__exit(0);
}
@ -1453,6 +1477,10 @@ extern "C" void __bolt_instr_setup() {
new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
if (__bolt_instr_sleep_time != 0) {
// Separate instrumented process to the own process group
if (__bolt_instr_wait_forks)
__setpgid(0, 0);
if (auto PID = __fork())
return;
watchProcess();

View File

@ -49,6 +49,18 @@ cl::opt<uint32_t> InstrumentationSleepTime(
"program and the profile is not being dumped at the end."),
cl::init(0), cl::Optional, cl::cat(BoltInstrCategory));
// -instrumentation-no-counters-clear: boolean flag, off by default, registered
// in the BOLT instrumentation option category.
cl::opt<bool> InstrumentationNoCountersClear(
    "instrumentation-no-counters-clear",
    cl::desc("Don't clear counters across dumps (use with "
             "instrumentation-sleep-time option)"),
    cl::init(false), cl::Optional, cl::cat(BoltInstrCategory));
// -instrumentation-wait-forks: boolean flag, off by default, registered in the
// BOLT instrumentation option category.
cl::opt<bool> InstrumentationWaitForks(
    "instrumentation-wait-forks",
    cl::desc("Wait until all forks of instrumented process "
             "will finish (use with instrumentation-sleep-time option)"),
    cl::init(false), cl::Optional, cl::cat(BoltInstrCategory));
cl::opt<bool>
InstrumentHotOnly("instrument-hot-only",
cl::desc("only insert instrumentation on hot functions "

View File

@ -24,6 +24,8 @@ extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> InstrumentationFileAppendPID;
extern cl::opt<std::string> InstrumentationFilename;
extern cl::opt<uint32_t> InstrumentationSleepTime;
extern cl::opt<bool> InstrumentationNoCountersClear;
extern cl::opt<bool> InstrumentationWaitForks;
cl::opt<bool>
Instrument("instrument",
@ -93,31 +95,43 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
"__BOLT", "__counters", MachO::S_REGULAR,
SectionKind::getData()));
Section->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(Section);
auto EmitLabel = [&](MCSymbol *Symbol, bool IsGlobal = true) {
Streamer.emitLabel(Symbol);
if (IsGlobal)
Streamer.emitSymbolAttribute(Symbol, MCSymbolAttr::MCSA_Global);
};
auto EmitLabelByName = [&](StringRef Name, bool IsGlobal = true) {
MCSymbol *Symbol = BC.Ctx->getOrCreateSymbol(Name);
EmitLabel(Symbol, IsGlobal);
};
auto EmitValue = [&](MCSymbol *Symbol, const MCExpr *Value) {
EmitLabel(Symbol);
Streamer.emitValue(Value, /*Size*/ 8);
};
auto EmitIntValue = [&](StringRef Name, uint64_t Value, unsigned Size = 4) {
EmitLabelByName(Name);
Streamer.emitIntValue(Value, Size);
};
auto EmitString = [&](StringRef Name, StringRef Contents) {
EmitLabelByName(Name);
Streamer.emitBytes(Contents);
Streamer.emitFill(1, 0);
};
// All of the following symbols will be exported as globals to be used by the
// instrumentation runtime library to dump the instrumentation data to disk.
// Label marking start of the memory region containing instrumentation
// counters, total vector size is Counters.size() 8-byte counters
MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
MCSymbol *NumLocs = BC.Ctx->getOrCreateSymbol("__bolt_num_counters");
MCSymbol *NumIndCalls =
BC.Ctx->getOrCreateSymbol("__bolt_instr_num_ind_calls");
MCSymbol *NumIndCallTargets =
BC.Ctx->getOrCreateSymbol("__bolt_instr_num_ind_targets");
MCSymbol *NumFuncs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_funcs");
/// File name where profile is going to written to after target binary
/// finishes a run
MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");
MCSymbol *UsePIDSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_use_pid");
MCSymbol *InitPtr = BC.Ctx->getOrCreateSymbol("__bolt_instr_init_ptr");
MCSymbol *FiniPtr = BC.Ctx->getOrCreateSymbol("__bolt_instr_fini_ptr");
MCSymbol *SleepSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_sleep_time");
Section->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(Section);
Streamer.emitLabel(Locs);
Streamer.emitSymbolAttribute(Locs, MCSymbolAttr::MCSA_Global);
EmitLabelByName("__bolt_instr_locations");
for (const auto &Label : Summary->Counters) {
Streamer.emitLabel(Label);
EmitLabel(Label, /*IsGlobal*/ false);
Streamer.emitFill(8, 0);
}
const uint64_t Padding =
@ -125,63 +139,40 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
8 * Summary->Counters.size();
if (Padding)
Streamer.emitFill(Padding, 0);
Streamer.emitLabel(SleepSym);
Streamer.emitSymbolAttribute(SleepSym, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(opts::InstrumentationSleepTime, /*Size=*/4);
Streamer.emitLabel(NumLocs);
Streamer.emitSymbolAttribute(NumLocs, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->Counters.size(), /*Size=*/4);
Streamer.emitLabel(Summary->IndCallHandlerFunc);
Streamer.emitSymbolAttribute(Summary->IndCallHandlerFunc,
MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(
Summary->InitialIndCallHandlerFunction->getSymbol(), *BC.Ctx),
/*Size=*/8);
Streamer.emitLabel(Summary->IndTailCallHandlerFunc);
Streamer.emitSymbolAttribute(Summary->IndTailCallHandlerFunc,
MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(
Summary->InitialIndTailCallHandlerFunction->getSymbol(), *BC.Ctx),
/*Size=*/8);
Streamer.emitLabel(NumIndCalls);
Streamer.emitSymbolAttribute(NumIndCalls, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->IndCallDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(NumIndCallTargets);
Streamer.emitSymbolAttribute(NumIndCallTargets, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(NumFuncs);
Streamer.emitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(FilenameSym);
Streamer.emitBytes(opts::InstrumentationFilename);
Streamer.emitFill(1, 0);
Streamer.emitLabel(UsePIDSym);
Streamer.emitIntValue(opts::InstrumentationFileAppendPID ? 1 : 0, /*Size=*/1);
Streamer.emitLabel(InitPtr);
Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(StartFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
EmitIntValue("__bolt_instr_sleep_time", opts::InstrumentationSleepTime);
EmitIntValue("__bolt_instr_no_counters_clear",
!!opts::InstrumentationNoCountersClear, 1);
EmitIntValue("__bolt_instr_wait_forks", !!opts::InstrumentationWaitForks, 1);
EmitIntValue("__bolt_num_counters", Summary->Counters.size());
EmitValue(Summary->IndCallHandlerFunc,
MCSymbolRefExpr::create(
Summary->InitialIndCallHandlerFunction->getSymbol(), *BC.Ctx));
EmitValue(
Summary->IndTailCallHandlerFunc,
MCSymbolRefExpr::create(
Summary->InitialIndTailCallHandlerFunction->getSymbol(), *BC.Ctx));
EmitIntValue("__bolt_instr_num_ind_calls",
Summary->IndCallDescriptions.size());
EmitIntValue("__bolt_instr_num_ind_targets",
Summary->IndCallTargetDescriptions.size());
EmitIntValue("__bolt_instr_num_funcs", Summary->FunctionDescriptions.size());
EmitString("__bolt_instr_filename", opts::InstrumentationFilename);
EmitIntValue("__bolt_instr_use_pid", !!opts::InstrumentationFileAppendPID, 1);
EmitValue(BC.Ctx->getOrCreateSymbol("__bolt_instr_init_ptr"),
MCSymbolRefExpr::create(StartFunction->getSymbol(), *BC.Ctx));
if (FiniFunction) {
Streamer.emitLabel(FiniPtr);
Streamer.emitSymbolAttribute(FiniPtr, MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
EmitValue(BC.Ctx->getOrCreateSymbol("__bolt_instr_fini_ptr"),
MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx));
}
if (BC.isMachO()) {
MCSection *TablesSection = BC.Ctx->getMachOSection(
"__BOLT", "__tables", MachO::S_REGULAR,
SectionKind::getData());
MCSymbol *Tables = BC.Ctx->getOrCreateSymbol("__bolt_instr_tables");
TablesSection->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(TablesSection);
Streamer.emitLabel(Tables);
Streamer.emitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global);
Streamer.emitBytes(buildTables(BC));
EmitString("__bolt_instr_tables", buildTables(BC));
}
}