From 76d346ca14d412560c464310cdb23931ffed0eed Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 9 Mar 2021 16:18:11 -0800 Subject: [PATCH] [BOLT][PR] Instrumentation: Introduce -no-counters-clear and -wait-forks options Summary: This PR introduces 2 new instrumentation options: 1. instrumentation-no-counters-clear: Discussed at https://github.com/facebookincubator/BOLT/issues/121 2. instrumentation-wait-forks: Since the instrumentation counters are mapped as MAP_SHARED, it would be nice to add the ability to wait until all forks of the parent process have died by tracking the process group. The last patch is just an emitBinary code refactor. Vladislav Khmelevsky, Advanced Software Technology Lab, Huawei Pull Request resolved: https://github.com/facebookincubator/BOLT/pull/125 GitHub Author: Vladislav Khmelevskyi (cherry picked from FBD26919011) --- bolt/runtime/common.h | 40 ++++++ bolt/runtime/instr.cpp | 40 +++++- bolt/src/Passes/Instrumentation.cpp | 12 ++ .../InstrumentationRuntimeLibrary.cpp | 125 ++++++++---------- 4 files changed, 144 insertions(+), 73 deletions(-) diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h index 2ea082d4d139..206dfbc18ae7 100644 --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -1,3 +1,13 @@ +//===-- common.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + #if !defined(__APPLE__) #include @@ -333,6 +343,36 @@ uint64_t __getppid() { return ret; } +int __setpgid(uint64_t pid, uint64_t pgid) { + int ret; + __asm__ __volatile__("movq $109, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(pgid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpgid(uint64_t pid) { + uint64_t ret; + __asm__ __volatile__("movq $121, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __kill(uint64_t pid, int sig) { + int ret; + __asm__ __volatile__("movq $62, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(sig) + : "cc", "rcx", "r11", "memory"); + return ret; +} + #endif void reportError(const char *Msg, uint64_t Size) { diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index 72bbc6aa74a4..c2bbbf5c9840 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -84,6 +84,10 @@ extern uint32_t __bolt_instr_num_ind_targets; extern uint32_t __bolt_instr_num_funcs; // Time to sleep across dumps (when we write the fdata profile to disk) extern uint32_t __bolt_instr_sleep_time; +// Do not clear counters across dumps, rewrite file with the updated values +extern bool __bolt_instr_no_counters_clear; +// Wait until all forks of instrumented process will finish +extern bool __bolt_instr_wait_forks; // Filename to dump data to extern char __bolt_instr_filename[]; // If true, append current PID to the fdata filename when creating it so @@ -1402,23 +1406,43 @@ extern "C" void __bolt_instr_data_dump() { void watchProcess() { timespec ts, rem; uint64_t Ellapsed = 0ull; + uint64_t ppid; + if (__bolt_instr_wait_forks) { + // Store parent pgid + ppid = -__getpgid(0); + // And leave parent process group + __setpgid(0, 0); + 
} else { + // Store parent pid + ppid = __getppid(); + if (ppid == 1) { + // Parent already dead + goto out; + } + } + ts.tv_sec = 1; ts.tv_nsec = 0; while (1) { __nanosleep(&ts, &rem); - // This means our parent process died, so no need for us to keep dumping. - // Notice that make and some systems will wait until all child processes - // of a command finishes before proceeding, so it is important to exit as - // early as possible once our parent dies. - if (__getppid() == 1) { + // This means our parent process or all its forks are dead, + // so no need for us to keep dumping. + if (__kill(ppid, 0) < 0) { + if (__bolt_instr_no_counters_clear) + __bolt_instr_data_dump(); break; } + if (++Ellapsed < __bolt_instr_sleep_time) continue; + Ellapsed = 0; __bolt_instr_data_dump(); - __bolt_instr_clear_counters(); + if (__bolt_instr_no_counters_clear == false) + __bolt_instr_clear_counters(); } + +out:; DEBUG(report("My parent process is dead, bye!\n")); __exit(0); } @@ -1453,6 +1477,10 @@ extern "C" void __bolt_instr_setup() { new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls]; if (__bolt_instr_sleep_time != 0) { + // Separate instrumented process to the own process group + if (__bolt_instr_wait_forks) + __setpgid(0, 0); + if (auto PID = __fork()) return; watchProcess(); diff --git a/bolt/src/Passes/Instrumentation.cpp b/bolt/src/Passes/Instrumentation.cpp index 7489ec41e1f4..082df9e967b7 100644 --- a/bolt/src/Passes/Instrumentation.cpp +++ b/bolt/src/Passes/Instrumentation.cpp @@ -49,6 +49,18 @@ cl::opt<uint32_t> InstrumentationSleepTime( "program and the profile is not being dumped at the end."), cl::init(0), cl::Optional, cl::cat(BoltInstrCategory)); +cl::opt<bool> InstrumentationNoCountersClear( + "instrumentation-no-counters-clear", + cl::desc("Don't clear counters across dumps " + "(use with instrumentation-sleep-time option)"), + cl::init(false), cl::Optional, cl::cat(BoltInstrCategory)); + +cl::opt<bool> InstrumentationWaitForks( + "instrumentation-wait-forks", 
+ cl::desc("Wait until all forks of instrumented process will finish " + "(use with instrumentation-sleep-time option)"), + cl::init(false), cl::Optional, cl::cat(BoltInstrCategory)); + cl::opt<bool> InstrumentHotOnly("instrument-hot-only", cl::desc("only insert instrumentation on hot functions " diff --git a/bolt/src/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/src/RuntimeLibs/InstrumentationRuntimeLibrary.cpp index f5a22c4e7f7a..e9219d509a35 100644 --- a/bolt/src/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/src/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -24,6 +24,8 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt<bool> InstrumentationFileAppendPID; extern cl::opt<std::string> InstrumentationFilename; extern cl::opt<uint32_t> InstrumentationSleepTime; +extern cl::opt<bool> InstrumentationNoCountersClear; +extern cl::opt<bool> InstrumentationWaitForks; cl::opt<bool> Instrument("instrument", @@ -93,31 +95,43 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, "__BOLT", "__counters", MachO::S_REGULAR, SectionKind::getData())); + Section->setAlignment(llvm::Align(BC.RegularPageSize)); + Streamer.SwitchSection(Section); + + auto EmitLabel = [&](MCSymbol *Symbol, bool IsGlobal = true) { + Streamer.emitLabel(Symbol); + if (IsGlobal) + Streamer.emitSymbolAttribute(Symbol, MCSymbolAttr::MCSA_Global); + }; + + auto EmitLabelByName = [&](StringRef Name, bool IsGlobal = true) { + MCSymbol *Symbol = BC.Ctx->getOrCreateSymbol(Name); + EmitLabel(Symbol, IsGlobal); + }; + + auto EmitValue = [&](MCSymbol *Symbol, const MCExpr *Value) { + EmitLabel(Symbol); + Streamer.emitValue(Value, /*Size*/ 8); + }; + + auto EmitIntValue = [&](StringRef Name, uint64_t Value, unsigned Size = 4) { + EmitLabelByName(Name); + Streamer.emitIntValue(Value, Size); + }; + + auto EmitString = [&](StringRef Name, StringRef Contents) { + EmitLabelByName(Name); + Streamer.emitBytes(Contents); + Streamer.emitFill(1, 0); + }; + // All of the following symbols will be exported as globals to be used by the // 
instrumentation runtime library to dump the instrumentation data to disk. // Label marking start of the memory region containing instrumentation // counters, total vector size is Counters.size() 8-byte counters - MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations"); - MCSymbol *NumLocs = BC.Ctx->getOrCreateSymbol("__bolt_num_counters"); - MCSymbol *NumIndCalls = - BC.Ctx->getOrCreateSymbol("__bolt_instr_num_ind_calls"); - MCSymbol *NumIndCallTargets = - BC.Ctx->getOrCreateSymbol("__bolt_instr_num_ind_targets"); - MCSymbol *NumFuncs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_funcs"); - /// File name where profile is going to written to after target binary - /// finishes a run - MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename"); - MCSymbol *UsePIDSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_use_pid"); - MCSymbol *InitPtr = BC.Ctx->getOrCreateSymbol("__bolt_instr_init_ptr"); - MCSymbol *FiniPtr = BC.Ctx->getOrCreateSymbol("__bolt_instr_fini_ptr"); - MCSymbol *SleepSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_sleep_time"); - - Section->setAlignment(llvm::Align(BC.RegularPageSize)); - Streamer.SwitchSection(Section); - Streamer.emitLabel(Locs); - Streamer.emitSymbolAttribute(Locs, MCSymbolAttr::MCSA_Global); + EmitLabelByName("__bolt_instr_locations"); for (const auto &Label : Summary->Counters) { - Streamer.emitLabel(Label); + EmitLabel(Label, /*IsGlobal*/ false); Streamer.emitFill(8, 0); } const uint64_t Padding = @@ -125,63 +139,40 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, 8 * Summary->Counters.size(); if (Padding) Streamer.emitFill(Padding, 0); - Streamer.emitLabel(SleepSym); - Streamer.emitSymbolAttribute(SleepSym, MCSymbolAttr::MCSA_Global); - Streamer.emitIntValue(opts::InstrumentationSleepTime, /*Size=*/4); - Streamer.emitLabel(NumLocs); - Streamer.emitSymbolAttribute(NumLocs, MCSymbolAttr::MCSA_Global); - Streamer.emitIntValue(Summary->Counters.size(), /*Size=*/4); - 
Streamer.emitLabel(Summary->IndCallHandlerFunc); - Streamer.emitSymbolAttribute(Summary->IndCallHandlerFunc, - MCSymbolAttr::MCSA_Global); - Streamer.emitValue( - MCSymbolRefExpr::create( - Summary->InitialIndCallHandlerFunction->getSymbol(), *BC.Ctx), - /*Size=*/8); - Streamer.emitLabel(Summary->IndTailCallHandlerFunc); - Streamer.emitSymbolAttribute(Summary->IndTailCallHandlerFunc, - MCSymbolAttr::MCSA_Global); - Streamer.emitValue( - MCSymbolRefExpr::create( - Summary->InitialIndTailCallHandlerFunction->getSymbol(), *BC.Ctx), - /*Size=*/8); - Streamer.emitLabel(NumIndCalls); - Streamer.emitSymbolAttribute(NumIndCalls, MCSymbolAttr::MCSA_Global); - Streamer.emitIntValue(Summary->IndCallDescriptions.size(), /*Size=*/4); - Streamer.emitLabel(NumIndCallTargets); - Streamer.emitSymbolAttribute(NumIndCallTargets, MCSymbolAttr::MCSA_Global); - Streamer.emitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4); - Streamer.emitLabel(NumFuncs); - Streamer.emitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global); - Streamer.emitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4); - Streamer.emitLabel(FilenameSym); - Streamer.emitBytes(opts::InstrumentationFilename); - Streamer.emitFill(1, 0); - Streamer.emitLabel(UsePIDSym); - Streamer.emitIntValue(opts::InstrumentationFileAppendPID ? 
1 : 0, /*Size=*/1); - - Streamer.emitLabel(InitPtr); - Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global); - Streamer.emitValue( - MCSymbolRefExpr::create(StartFunction->getSymbol(), *BC.Ctx), /*Size=*/8); + EmitIntValue("__bolt_instr_sleep_time", opts::InstrumentationSleepTime); + EmitIntValue("__bolt_instr_no_counters_clear", + !!opts::InstrumentationNoCountersClear, 1); + EmitIntValue("__bolt_instr_wait_forks", !!opts::InstrumentationWaitForks, 1); + EmitIntValue("__bolt_num_counters", Summary->Counters.size()); + EmitValue(Summary->IndCallHandlerFunc, + MCSymbolRefExpr::create( + Summary->InitialIndCallHandlerFunction->getSymbol(), *BC.Ctx)); + EmitValue( + Summary->IndTailCallHandlerFunc, + MCSymbolRefExpr::create( + Summary->InitialIndTailCallHandlerFunction->getSymbol(), *BC.Ctx)); + EmitIntValue("__bolt_instr_num_ind_calls", + Summary->IndCallDescriptions.size()); + EmitIntValue("__bolt_instr_num_ind_targets", + Summary->IndCallTargetDescriptions.size()); + EmitIntValue("__bolt_instr_num_funcs", Summary->FunctionDescriptions.size()); + EmitString("__bolt_instr_filename", opts::InstrumentationFilename); + EmitIntValue("__bolt_instr_use_pid", !!opts::InstrumentationFileAppendPID, 1); + EmitValue(BC.Ctx->getOrCreateSymbol("__bolt_instr_init_ptr"), + MCSymbolRefExpr::create(StartFunction->getSymbol(), *BC.Ctx)); if (FiniFunction) { - Streamer.emitLabel(FiniPtr); - Streamer.emitSymbolAttribute(FiniPtr, MCSymbolAttr::MCSA_Global); - Streamer.emitValue( - MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx), /*Size=*/8); + EmitValue(BC.Ctx->getOrCreateSymbol("__bolt_instr_fini_ptr"), + MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx)); } if (BC.isMachO()) { MCSection *TablesSection = BC.Ctx->getMachOSection( "__BOLT", "__tables", MachO::S_REGULAR, SectionKind::getData()); - MCSymbol *Tables = BC.Ctx->getOrCreateSymbol("__bolt_instr_tables"); TablesSection->setAlignment(llvm::Align(BC.RegularPageSize)); 
Streamer.SwitchSection(TablesSection); - Streamer.emitLabel(Tables); - Streamer.emitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global); - Streamer.emitBytes(buildTables(BC)); + EmitString("__bolt_instr_tables", buildTables(BC)); } }