diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index b3d9328f6a24..8bb3919b18e1 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -97,11 +97,12 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { case 0: Valid = !CondBranch && !UncondBranch; break; - case 1: - Valid = !CondBranch || - (CondBranch && - !Function->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch))); + case 1: { + const bool HasCondBlock = CondBranch && + Function->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch)); + Valid = !CondBranch || !HasCondBlock; break; + } case 2: Valid = (CondBranch && @@ -121,7 +122,7 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { << Twine::utohexstr(BC.MIA->getJumpTable(*Inst)) << "\n"; JT->print(errs()); } - dump(); + getFunction()->dump(); } return Valid; } @@ -452,5 +453,18 @@ uint64_t BinaryBasicBlock::estimateSize() const { return Function->getBinaryContext().computeCodeSize(begin(), end()); } +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) { + auto BI = branch_info_begin(); + for (auto BB : successors()) { + if (&Succ == BB) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 49949c9263c0..52db09c8a8ed 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -363,15 +363,14 @@ public: return BranchInfo[Condition == true ? 
0 : 1]; }; - BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ) { - auto BI = branch_info_begin(); - for (auto BB : successors()) { - if (&Succ == BB) - return *BI; - ++BI; - } - llvm_unreachable("Invalid successor"); - return *BI; + BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + + void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, + uint64_t Count, + uint64_t MispredictedCount) { + auto &BI = getBranchInfo(Succ); + BI.Count = Count; + BI.MispredictedCount = MispredictedCount; } /// Try to compute the taken and misprediction frequencies for the given diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 82de83bd8e7b..d1f38138bb09 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -27,8 +27,6 @@ namespace opts { extern cl::OptionCategory BoltCategory; -extern cl::opt ReorderFunctions; - static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), @@ -215,16 +213,14 @@ std::vector BinaryContext::getSortedFunctions( return &BFI.second; }); - if (opts::ReorderFunctions != BinaryFunction::RT_NONE) { - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else { - return A->hasValidIndex(); - } - }); - } + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else { + return A->hasValidIndex(); + } + }); return SortedFunctions; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 65804f0b5ab4..ad4909e9f013 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -169,6 +169,9 @@ public: /// Number of functions with profile information uint64_t NumProfiledFuncs{0}; + /// Total hotness score according to profiling data for this 
binary. + uint64_t TotalScore{0}; + /// Track next available address for new allocatable sections. RewriteInstance /// sets this prior to running BOLT passes, so layout passes are aware of the /// final addresses functions will have. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 04615a4898ce..a6c75fea32b2 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,7 +13,6 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" -#include "Passes/MCF.h" #include "llvm/ADT/edit_distance.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -52,7 +51,6 @@ extern cl::OptionCategory BoltRelocCategory; extern bool shouldProcess(const BinaryFunction &); extern cl::opt UpdateDebugSections; -extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; static cl::opt @@ -61,27 +59,6 @@ AlignBlocks("align-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -DoMCF("mcf", - cl::desc("solve a min cost flow problem on the CFG to fix edge counts " - "(default=disable)"), - cl::init(MCF_DISABLE), - cl::values( - clEnumValN(MCF_DISABLE, "none", - "disable MCF"), - clEnumValN(MCF_LINEAR, "linear", - "cost function is inversely proportional to edge count"), - clEnumValN(MCF_QUADRATIC, "quadratic", - "cost function is inversely proportional to edge count squared"), - clEnumValN(MCF_LOG, "log", - "cost function is inversely proportional to log of edge count"), - clEnumValN(MCF_BLAMEFTS, "blamefts", - "tune cost to blame fall-through edges for surplus flow"), - clEnumValEnd), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - static cl::opt DotToolTipCode("dot-tooltip-code", cl::desc("add basic block instructions as tool tips on nodes"), @@ -1185,21 +1162,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } BC.MIA->replaceBranchTarget(Instruction, TargetSymbol, &*Ctx); - // Record call offset for profile matching. 
- if (IsCall) { - MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); - } - if (IsCondBranch) { - // Add fallthrough branch info. - FTBranches.emplace_back(Offset, Offset + Size); - if (IsCall) { - MIA->setConditionalTailCall(Instruction, TargetAddress); - } + // Mark CTC. + if (IsCondBranch && IsCall) { + MIA->setConditionalTailCall(Instruction, TargetAddress); } } else { // Could not evaluate branch. Should be an indirect call or an // indirect branch. Bail out on the latter case. - MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); if (MIA->isIndirectBranch(Instruction)) { auto Result = processIndirectBranch(Instruction, Size, Offset); switch (Result) { @@ -1255,6 +1224,9 @@ add_instruction: findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } + // Record offset of the instruction for profile matching. + MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); + if (MemData && !emptyRange(MemData->getMemInfoRange(Offset))) { MIA->addAnnotation(Ctx.get(), Instruction, "MemDataOffset", Offset); } @@ -1563,9 +1535,6 @@ bool BinaryFunction::buildCFG() { // e.g. exit(3), etc. Otherwise we'll see a false fall-through // blocks. - // Possibly assign/re-assign branch profile data. - matchProfileData(); - for (auto &Branch : TakenBranches) { DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); @@ -1574,124 +1543,15 @@ bool BinaryFunction::buildCFG() { auto *ToBB = getBasicBlockAtOffset(Branch.second); assert(ToBB && "cannot find BB containing TO branch"); - if (!BranchData) { - FromBB->addSuccessor(ToBB); - continue; - } - - auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); - if (!BranchInfoOrErr) { - FromBB->addSuccessor(ToBB); - continue; - } - - const BranchInfo &BInfo = BranchInfoOrErr.get(); - FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); - - // Populate profile counts for the jump table. 
- auto *LastInstr = FromBB->getLastNonPseudoInstr(); - if (!LastInstr) - continue; - auto JTAddress = BC.MIA->getJumpTable(*LastInstr); - if (!JTAddress) - continue; - auto *JT = getJumpTableContainingAddress(JTAddress); - if (!JT) - continue; - JT->Count += BInfo.Branches; - if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && - opts::JumpTables < JTS_AGGRESSIVE) - continue; - if (JT->Counts.empty()) - JT->Counts.resize(JT->Entries.size()); - auto EI = JT->Entries.begin(); - auto Delta = (JTAddress - JT->Address) / JT->EntrySize; - EI += Delta; - while (EI != JT->Entries.end()) { - if (ToBB->getLabel() == *EI) { - assert(Delta < JT->Counts.size()); - JT->Counts[Delta].Mispreds += BInfo.Mispreds; - JT->Counts[Delta].Count += BInfo.Branches; - } - ++Delta; - ++EI; - // A label marks the start of another jump table. - if (JT->Labels.count(Delta * JT->EntrySize)) - break; - } + FromBB->addSuccessor(ToBB); } - for (auto &Branch : FTBranches) { - DEBUG(dbgs() << "registering fallthrough [0x" - << Twine::utohexstr(Branch.first) << "] -> [0x" - << Twine::utohexstr(Branch.second) << "]\n"); - auto *FromBB = getBasicBlockContainingOffset(Branch.first); - assert(FromBB && "cannot find BB containing FROM branch"); - // Try to find the destination basic block. If the jump instruction was - // followed by a no-op then the destination offset recorded in FTBranches - // will point to that no-op but the destination basic block will start - // after the no-op due to ignoring no-ops when creating basic blocks. - // So we have to skip any no-ops when trying to find the destination - // basic block. 
- auto *ToBB = getBasicBlockAtOffset(Branch.second); - if (ToBB == nullptr) { - auto I = Instructions.find(Branch.second), E = Instructions.end(); - while (ToBB == nullptr && I != E && MIA->isNoop(I->second)) { - ++I; - if (I == E) - break; - ToBB = getBasicBlockAtOffset(I->first); - } - if (ToBB == nullptr) { - // We have a fall-through that does not point to another BB, ignore it - // as it may happen in cases where we have a BB finished by two - // branches. - // This can also happen when we delete a branch past the end of a - // function in case of a call to __builtin_unreachable(). - continue; - } - } - - // Does not add a successor if we can't find profile data, leave it to the - // inference pass to guess its frequency - if (BranchData) { - auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); - if (BranchInfoOrErr) { - const BranchInfo &BInfo = BranchInfoOrErr.get(); - FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); - } - } - } - - if (BranchData) { - for (auto BB : BasicBlocks) { - auto *CTCInstr = BB->getLastNonPseudoInstr(); - if (!CTCInstr || !MIA->getConditionalTailCall(*CTCInstr)) - continue; - - auto OffsetOrErr = - MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); - assert(OffsetOrErr && "offset not set for conditional tail call"); - - auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); - if (!BranchInfoOrErr) - continue; - - MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", - BranchInfoOrErr->Branches); - MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", - BranchInfoOrErr->Mispreds); - } - } - - // Add fall-through branches (except for non-taken conditional branches with - // profile data, which were already accounted for in TakenBranches). + // Add fall-through branches. PrevBB = nullptr; bool IsPrevFT = false; // Is previous block a fall-through. 
for (auto BB : BasicBlocks) { if (IsPrevFT) { - PrevBB->addSuccessor(BB, BinaryBasicBlock::COUNT_NO_PROFILE, - BinaryBasicBlock::COUNT_INFERRED); + PrevBB->addSuccessor(BB); } if (BB->empty()) { IsPrevFT = true; @@ -1703,29 +1563,18 @@ bool BinaryFunction::buildCFG() { assert(LastInstr && "should have non-pseudo instruction in non-empty block"); - const auto IsCondTailCall = MIA->getConditionalTailCall(*LastInstr); if (BB->succ_size() == 0) { - if (IsCondTailCall) { - // Conditional tail call without profile data for non-taken branch. - IsPrevFT = true; - } else { - // Unless the last instruction is a terminator, control will fall - // through to the next basic block. - IsPrevFT = !MIA->isTerminator(*LastInstr); - } + // Since there's no existing successors, we know the last instruction is + // not a conditional branch. Thus if it's a terminator, it shouldn't be a + // fall-through. + // + // Conditional tail call is a special case since we don't add a taken + // branch successor for it. + IsPrevFT = !MIA->isTerminator(*LastInstr) || + MIA->getConditionalTailCall(*LastInstr); } else if (BB->succ_size() == 1) { - if (IsCondTailCall) { - // Conditional tail call with data for non-taken branch. A fall-through - // edge has already ben added in the CFG. - IsPrevFT = false; - } else { - // Fall-through should be added if the last instruction is a conditional - // jump, since there was no profile data for the non-taken branch. - IsPrevFT = MIA->isConditionalBranch(*LastInstr); - } + IsPrevFT = MIA->isConditionalBranch(*LastInstr); } else { - // Ends with 2 branches, with an indirect jump or it is a conditional - // branch whose frequency has been inferred from LBR. IsPrevFT = false; } @@ -1734,26 +1583,20 @@ bool BinaryFunction::buildCFG() { if (!IsPrevFT) { // Possibly a call that does not return. 
- DEBUG(dbgs() << "last block was marked as a fall-through\n"); + DEBUG(dbgs() << "last block was marked as a fall-through in " << *this + << '\n'); } + // Assign landing pads and throwers info. recomputeLandingPads(); - // Infer frequency for non-taken branches - if (hasValidProfile() && opts::DoMCF != MCF_DISABLE) { - // Convert COUNT_NO_PROFILE to 0 - removeTagsFromProfile(); - solveMCF(*this, opts::DoMCF); - } else if (hasValidProfile()) { - inferFallThroughCounts(); - } else { - clearProfile(); - } - // Assign CFI information to each BB entry. annotateCFIState(); - // Set the basic block layout to the original order. + // Annotate invoke instructions with GNU_args_size data. + propagateGnuArgsSizeInfo(); + + // Set the basic block layout to the original order and set end offsets. PrevBB = nullptr; for (auto BB : BasicBlocks) { BasicBlocksLayout.emplace_back(BB); @@ -1763,33 +1606,37 @@ bool BinaryFunction::buildCFG() { } PrevBB->setEndOffset(getSize()); - // Convert conditional tail call branches to conditional branches that jump - // to a tail call. - // TODO: make a separate pass - removeConditionalTailCalls(); + updateLayoutIndices(); - // Make any necessary adjustments for indirect branches. - if (!postProcessIndirectBranches()) { - if (opts::Verbosity) { - errs() << "BOLT-WARNING: failed to post-process indirect branches for " - << *this << '\n'; + // Update the state. + CurrentState = State::CFG; + + return true; +} + +void BinaryFunction::postProcessCFG() { + if (isSimple() && !BasicBlocks.empty()) { + // Convert conditional tail call branches to conditional branches that jump + // to a tail call. + removeConditionalTailCalls(); + + // Make any necessary adjustments for indirect branches. + if (!postProcessIndirectBranches()) { + if (opts::Verbosity) { + errs() << "BOLT-WARNING: failed to post-process indirect branches for " + << *this << '\n'; + } + // In relocation mode we want to keep processing the function but avoid + // optimizing it. 
+ setSimple(false); + } else { + postProcessProfile(); + + // Eliminate inconsistencies between branch instructions and CFG. + postProcessBranches(); } - // In relocation mode we want to keep processing the function but avoid - // optimizing it. - setSimple(false); } - // Eliminate inconsistencies between branch instructions and CFG. - postProcessBranches(); - - // If our profiling data comes from samples instead of LBR entries, - // now is the time to read this data and attach it to BBs. At this point, - // conditional tail calls are converted into a branch and a new basic block, - // making it slightly different than the original binary where profiled data - // was collected. However, this shouldn't matter for plain sampling events. - if (!BC.DR.hasLBR()) - readSampleData(); - // Clean-up memory taken by instructions and labels. // // NB: don't clear Labels list as we may need them if we mark the function @@ -1797,19 +1644,20 @@ bool BinaryFunction::buildCFG() { clearList(Instructions); clearList(OffsetToCFI); clearList(TakenBranches); - clearList(FTBranches); clearList(IgnoredBranches); clearList(EntryOffsets); - // Update the state. - CurrentState = State::CFG; + // Remove "Offset" annotations from instructions that don't need those. + for (auto *BB : layout()) { + for (auto &Inst : *BB) { + if (BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)) + continue; + BC.MIA->removeAnnotation(Inst, "Offset"); + } + } - // Annotate invoke instructions with GNU_args_size data. 
- propagateGnuArgsSizeInfo(); - - assert(validateCFG() && "Invalid CFG detected after disassembly"); - - return true; + assert((!isSimple() || validateCFG()) + && "Invalid CFG detected after post-processing CFG"); } void BinaryFunction::removeTagsFromProfile() { @@ -1826,57 +1674,6 @@ void BinaryFunction::removeTagsFromProfile() { } } -void BinaryFunction::readSampleData() { - auto SampleDataOrErr = BC.DR.getFuncSampleData(getNames()); - - if (!SampleDataOrErr) - return; - - // Non-LBR mode territory - // First step is to assign BB execution count based on samples from perf - ProfileMatchRatio = 1.0f; - removeTagsFromProfile(); - bool NormalizeByInsnCount = - BC.DR.usesEvent("cycles") || BC.DR.usesEvent("instructions"); - bool NormalizeByCalls = BC.DR.usesEvent("branches"); - static bool NagUser{true}; - if (NagUser) { - outs() << "BOLT-INFO: operating with non-LBR profiling data.\n"; - if (NormalizeByInsnCount) { - outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; - } else if (NormalizeByCalls) { - outs() << "BOLT-INFO: normalizing samples by branches.\n"; - } - NagUser = false; - } - uint64_t LastOffset = getSize(); - uint64_t TotalEntryCount{0}; - for (auto I = BasicBlockOffsets.rbegin(), E = BasicBlockOffsets.rend(); - I != E; ++I) { - uint64_t CurOffset = I->first; - // Always work with samples multiplied by 1000 to avoid losing them if we - // later need to normalize numbers - uint64_t NumSamples = - SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; - if (NormalizeByInsnCount && I->second->getNumNonPseudos()) - NumSamples /= I->second->getNumNonPseudos(); - else if (NormalizeByCalls) { - uint32_t NumCalls = I->second->getNumCalls(); - NumSamples /= NumCalls + 1; - } - I->second->setExecutionCount(NumSamples); - if (I->second->isEntryPoint()) - TotalEntryCount += NumSamples; - LastOffset = CurOffset; - } - ExecutionCount = TotalEntryCount; - - estimateEdgeCounts(BC, *this); - - if (opts::DoMCF != MCF_DISABLE) - solveMCF(*this, 
opts::DoMCF); -} - void BinaryFunction::addEntryPoint(uint64_t Address) { assert(containsAddress(Address) && "address does not belong to the function"); @@ -1930,377 +1727,7 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } } -bool BinaryFunction::fetchProfileForOtherEntryPoints() { - if (!BranchData) - return false; - - // Check if we are missing profiling data for secondary entry points - bool First{true}; - bool Updated{false}; - for (auto BB : BasicBlocks) { - if (First) { - First = false; - continue; - } - if (BB->isEntryPoint()) { - uint64_t EntryAddress = BB->getOffset() + getAddress(); - // Look for branch data associated with this entry point - std::vector Names; - std::multimap::iterator I, E; - for (std::tie(I, E) = BC.GlobalAddresses.equal_range(EntryAddress); - I != E; ++I) { - Names.push_back(I->second); - } - if (!Names.empty()) { - if (FuncBranchData *Data = BC.DR.getFuncBranchData(Names)) { - BranchData->appendFrom(*Data, BB->getOffset()); - Data->Used = true; - Updated = true; - } - } - } - } - return Updated; -} - -void BinaryFunction::matchProfileMemData() { - const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); - for (auto *NewMemData : AllMemData) { - // Prevent functions from sharing the same profile. - if (NewMemData->Used) - continue; - - if (MemData) - MemData->Used = false; - - // Update function profile data with the new set. 
- MemData = NewMemData; - MemData->Used = true; - break; - } -} - -void BinaryFunction::matchProfileData() { - // This functionality is available for LBR-mode only - // TODO: Implement evaluateProfileData() for samples, checking whether - // sample addresses match instruction addresses in the function - if (!BC.DR.hasLBR()) - return; - - if (BranchData) { - ProfileMatchRatio = evaluateProfileData(*BranchData); - if (ProfileMatchRatio == 1.0f) { - if (fetchProfileForOtherEntryPoints()) { - ProfileMatchRatio = evaluateProfileData(*BranchData); - ExecutionCount = BranchData->ExecutionCount; - } - return; - } - } - - // Check if the function name can fluctuate between several compilations - // possibly triggered by minor unrelated code changes in the source code - // of the input binary. - const auto HasVolatileName = [this]() { - for (const auto Name : getNames()) { - if (getLTOCommonName(Name)) - return true; - } - return false; - }(); - if (!HasVolatileName) - return; - - // Check for a profile that matches with 100% confidence. - const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); - for (auto *NewBranchData : AllBranchData) { - // Prevent functions from sharing the same profile. - if (NewBranchData->Used) - continue; - - if (evaluateProfileData(*NewBranchData) != 1.0f) - continue; - - if (BranchData) - BranchData->Used = false; - - // Update function profile data with the new set. - BranchData = NewBranchData; - ExecutionCount = NewBranchData->ExecutionCount; - ProfileMatchRatio = 1.0f; - BranchData->Used = true; - break; - } -} - -float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { - // Until we define a minimal profile, we consider an empty branch data to be - // a valid profile. It could happen to a function without branches when we - // still have an EntryData for execution count. 
- if (BranchData.Data.empty()) { - return 1.0f; - } - - BranchListType ProfileBranches(BranchData.Data.size()); - std::transform(BranchData.Data.begin(), - BranchData.Data.end(), - ProfileBranches.begin(), - [](const BranchInfo &BI) { - return std::make_pair(BI.From.Offset, - BI.To.Name == BI.From.Name ? - BI.To.Offset : -1U); - }); - BranchListType LocalProfileBranches; - std::copy_if(ProfileBranches.begin(), - ProfileBranches.end(), - std::back_inserter(LocalProfileBranches), - [](const std::pair &Branch) { - return Branch.second != -1U; - }); - - // Profile referencing external functions. - BranchListType ExternProfileBranches; - std::copy_if(ProfileBranches.begin(), - ProfileBranches.end(), - std::back_inserter(ExternProfileBranches), - [](const std::pair &Branch) { - return Branch.second == -1U; - }); - - std::sort(LocalProfileBranches.begin(), LocalProfileBranches.end()); - - BranchListType FunctionBranches = TakenBranches; - FunctionBranches.insert(FunctionBranches.end(), - FTBranches.begin(), - FTBranches.end()); - FunctionBranches.insert(FunctionBranches.end(), - IgnoredBranches.begin(), - IgnoredBranches.end()); - std::sort(FunctionBranches.begin(), FunctionBranches.end()); - - BranchListType DiffBranches; // Branches in profile without a match. - std::set_difference(LocalProfileBranches.begin(), - LocalProfileBranches.end(), - FunctionBranches.begin(), - FunctionBranches.end(), - std::back_inserter(DiffBranches)); - - // Branches without a match in CFG. - BranchListType OrphanBranches; - - // Eliminate recursive calls and returns from recursive calls from the list - // of branches that have no match. They are not considered local branches. - auto isRecursiveBranch = [&](std::pair &Branch) { - auto SrcInstrI = Instructions.find(Branch.first); - if (SrcInstrI == Instructions.end()) - return false; - - // Check if it is a recursive call. 
- const auto &SrcInstr = SrcInstrI->second; - if ((BC.MIA->isCall(SrcInstr) || BC.MIA->isIndirectBranch(SrcInstr)) && - Branch.second == 0) - return true; - - auto DstInstrI = Instructions.find(Branch.second); - if (DstInstrI == Instructions.end()) - return false; - - // Check if it is a return from a recursive call. - bool IsSrcReturn = BC.MIA->isReturn(SrcInstr); - // "rep ret" is considered to be 2 different instructions. - if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstr)) { - auto SrcInstrSuccessorI = SrcInstrI; - ++SrcInstrSuccessorI; - assert(SrcInstrSuccessorI != Instructions.end() && - "unexpected prefix instruction at the end of function"); - IsSrcReturn = BC.MIA->isReturn(SrcInstrSuccessorI->second); - } - if (IsSrcReturn && Branch.second != 0) { - // Make sure the destination follows the call instruction. - auto DstInstrPredecessorI = DstInstrI; - --DstInstrPredecessorI; - assert(DstInstrPredecessorI != Instructions.end() && "invalid iterator"); - if (BC.MIA->isCall(DstInstrPredecessorI->second)) - return true; - } - return false; - }; - std::remove_copy_if(DiffBranches.begin(), - DiffBranches.end(), - std::back_inserter(OrphanBranches), - isRecursiveBranch); - - // Check all external branches. - std::copy_if(ExternProfileBranches.begin(), - ExternProfileBranches.end(), - std::back_inserter(OrphanBranches), - [&](const std::pair &Branch) { - auto II = Instructions.find(Branch.first); - if (II == Instructions.end()) - return true; - const auto &Instr = II->second; - // Check for calls, tail calls, rets and indirect branches. - // When matching profiling info, we did not reach the stage - // when we identify tail calls, so they are still represented - // by regular branch instructions and we need isBranch() here. 
- if (BC.MIA->isCall(Instr) || - BC.MIA->isBranch(Instr) || - BC.MIA->isReturn(Instr)) - return false; - // Check for "rep ret" - if (BC.MIA->isPrefix(Instr)) { - ++II; - if (II != Instructions.end() && BC.MIA->isReturn(II->second)) - return false; - } - return true; - }); - - const float MatchRatio = - (float) (ProfileBranches.size() - OrphanBranches.size()) / - (float) ProfileBranches.size(); - - if (opts::Verbosity >= 2 && !OrphanBranches.empty()) { - errs() << "BOLT-WARNING: profile branches match only " - << format("%.1f%%", MatchRatio * 100.0f) << " (" - << (ProfileBranches.size() - OrphanBranches.size()) << '/' - << ProfileBranches.size() << ") for function " - << *this << '\n'; - DEBUG( - for (auto &OBranch : OrphanBranches) - errs() << "\t0x" << Twine::utohexstr(OBranch.first) << " -> 0x" - << Twine::utohexstr(OBranch.second) << " (0x" - << Twine::utohexstr(OBranch.first + getAddress()) << " -> 0x" - << Twine::utohexstr(OBranch.second + getAddress()) << ")\n"; - ); - } - - return MatchRatio; -} - -void BinaryFunction::clearProfile() { - // Keep function execution profile the same. Only clear basic block and edge - // counts. 
- for (auto *BB : BasicBlocks) { - BB->ExecutionCount = 0; - for (auto &BI : BB->branch_info()) { - BI.Count = 0; - BI.MispredictedCount = 0; - } - } -} - - -void BinaryFunction::inferFallThroughCounts() { - assert(!BasicBlocks.empty() && "basic block list should not be empty"); - assert(BranchData && "cannot infer counts without branch data"); - - // Compute preliminary execution count for each basic block - for (auto CurBB : BasicBlocks) { - CurBB->ExecutionCount = 0; - } - - for (auto CurBB : BasicBlocks) { - auto SuccBIIter = CurBB->branch_info_begin(); - for (auto Succ : CurBB->successors()) { - if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) - Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); - ++SuccBIIter; - } - } - - // Set entry BBs to zero, we'll update their execution count next with entry - // data (we maintain a separate data structure for branches to function entry - // points) - for (auto BB : BasicBlocks) { - if (BB->isEntryPoint()) - BB->ExecutionCount = 0; - } - - // Update execution counts of landing pad blocks and entry BBs - // There is a slight skew introduced here as branches originated from RETs - // may be accounted for in the execution count of an entry block if the last - // instruction in a predecessor fall-through block is a call. This situation - // should rarely happen because there are few multiple-entry functions. - for (const auto &I : BranchData->EntryData) { - BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { - BB->setExecutionCount(BB->getExecutionCount() + I.Branches); - } - } - - // Work on a basic block at a time, propagating frequency information - // forwards. - // It is important to walk in the layout order. 
- for (auto BB : BasicBlocks) { - uint64_t BBExecCount = BB->getExecutionCount(); - - // Propagate this information to successors, filling in fall-through edges - // with frequency information - if (BB->succ_size() == 0) - continue; - - // Calculate frequency of outgoing branches from this node according to - // LBR data. - uint64_t ReportedBranches = 0; - for (const auto &SuccBI : BB->branch_info()) { - if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) - ReportedBranches += SuccBI.Count; - } - - // Get taken count of conditional tail call if the block ends with one. - uint64_t CTCTakenCount = 0; - const auto CTCInstr = BB->getLastNonPseudoInstr(); - if (CTCInstr && BC.MIA->getConditionalTailCall(*CTCInstr)) { - CTCTakenCount = - BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); - } - - // Calculate frequency of throws from this node according to LBR data - // for branching into associated landing pads. Since it is possible - // for a landing pad to be associated with more than one basic blocks, - // we may overestimate the frequency of throws for such blocks. - uint64_t ReportedThrows = 0; - for (const auto *LP: BB->landing_pads()) { - ReportedThrows += LP->getExecutionCount(); - } - - const uint64_t TotalReportedJumps = - ReportedBranches + CTCTakenCount + ReportedThrows; - - // Infer the frequency of the fall-through edge, representing not taking the - // branch. - uint64_t Inferred = 0; - if (BBExecCount > TotalReportedJumps) - Inferred = BBExecCount - TotalReportedJumps; - - DEBUG( - if (opts::Verbosity >= 1 && BBExecCount < TotalReportedJumps) - errs() - << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " - "exec frequency is less than the outgoing edges frequency (" - << BBExecCount << " < " << ReportedBranches - << ") for BB at offset 0x" - << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; - ); - - if (BB->succ_size() <= 2) { - // If there is an FT it will be the last successor. 
- auto &SuccBI = *BB->branch_info_rbegin(); - auto &Succ = *BB->succ_rbegin(); - if (SuccBI.Count == BinaryBasicBlock::COUNT_NO_PROFILE) { - SuccBI.Count = Inferred; - Succ->ExecutionCount += Inferred; - } - } - } - - return; -} - void BinaryFunction::removeConditionalTailCalls() { - CurrentState = State::CFG; - // Blocks to be appended at the end. std::vector> NewBlocks; @@ -2373,6 +1800,9 @@ void BinaryFunction::removeConditionalTailCalls() { // Swap edges as the TailCallBB corresponds to the taken branch. BB.swapConditionalSuccessors(); } + + // This branch is no longer a conditional tail call. + BC.MIA->unsetConditionalTailCall(*CTCInstr); } insertBasicBlocks(std::prev(end()), @@ -3068,11 +2498,12 @@ void BinaryFunction::fixBranches() { // terminator) or more than 2 (switch table) don't require branch // instruction adjustments. } - assert(validateCFG() && "Invalid CFG detected after fixing branches"); + assert((!isSimple() || validateCFG()) + && "Invalid CFG detected after fixing branches"); } void BinaryFunction::propagateGnuArgsSizeInfo() { - assert(CurrentState == State::CFG && "unexpected function state"); + assert(CurrentState == State::Disassembled && "unexpected function state"); if (!hasEHRanges() || !usesGnuArgsSize()) return; @@ -3145,68 +2576,6 @@ void BinaryFunction::postProcessBranches() { assert(validateCFG() && "invalid CFG"); } -void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { - // No reason to merge invalid or empty profiles into BF. - if (!hasValidProfile()) - return; - - // Update function execution count. - if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { - BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); - } - - // Since we are merging a valid profile, the new profile should be valid too. - // It has either already been valid, or it has been cleaned up. - BF.ProfileMatchRatio = 1.0f; - - // Update basic block and edge counts. 
- auto BBMergeI = BF.begin(); - for (BinaryBasicBlock *BB : BasicBlocks) { - BinaryBasicBlock *BBMerge = &*BBMergeI; - assert(getIndex(BB) == BF.getIndex(BBMerge)); - - // Update basic block count. - if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { - BBMerge->setExecutionCount( - BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); - } - - // Update edge count for successors of this basic block. - auto BBMergeSI = BBMerge->succ_begin(); - auto BIMergeI = BBMerge->branch_info_begin(); - auto BII = BB->branch_info_begin(); - for (const auto *BBSucc : BB->successors()) { - (void)BBSucc; - assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); - - // At this point no branch count should be set to COUNT_NO_PROFILE. - assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "unexpected unknown branch profile"); - assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "unexpected unknown branch profile"); - - BIMergeI->Count += BII->Count; - - // When we merge inferred and real fall-through branch data, the merged - // data is considered inferred. - if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && - BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { - BIMergeI->MispredictedCount += BII->MispredictedCount; - } else { - BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; - } - - ++BBMergeSI; - ++BII; - ++BIMergeI; - } - assert(BBMergeSI == BBMerge->succ_end()); - - ++BBMergeI; - } - assert(BBMergeI == BF.end()); -} - BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { BasicBlockOrderType DFS; unsigned Index = 0; @@ -4058,6 +3427,28 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( return MergedRanges; } +MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { + if (CurrentState == State::Disassembled) { + auto II = Instructions.find(Offset); + return (II == Instructions.end()) ? 
nullptr : &II->second; + } else if (CurrentState == State::CFG) { + auto *BB = getBasicBlockContainingOffset(Offset); + if (!BB) + return nullptr; + + for (auto &Inst : *BB) { + constexpr auto InvalidOffset = std::numeric_limits::max(); + if (Offset == BC.MIA->getAnnotationWithDefault(Inst, "Offset", + InvalidOffset)) + return &Inst; + } + + return nullptr; + } else { + llvm_unreachable("invalid CFG state to use getInstructionAtOffset()"); + } +} + DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, uint64_t BaseAddress) const { @@ -4331,60 +3722,6 @@ DynoStats BinaryFunction::getDynoStats() const { return Stats; } -Optional, 16>> -BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { - SmallVector, 16> Res; - - if (CurrentState != State::Disassembled) - return NoneType(); - - // Get iterators and validate trace start/end - auto FromIter = Instructions.find(From); - if (FromIter == Instructions.end()) - return NoneType(); - - auto ToIter = Instructions.find(To); - if (ToIter == Instructions.end()) - return NoneType(); - - // Trace needs to go forward - if (FromIter->first > ToIter->first) - return NoneType(); - - // Trace needs to finish in a branch - if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && - !BC.MIA->isReturn(ToIter->second)) { - // Check for "rep ret" - if (!BC.MIA->isPrefix(ToIter->second)) { - return NoneType(); - } else { - ++ToIter; - if (!BC.MIA->isReturn(ToIter->second)) - return NoneType(); - } - } - - // Analyze intermediate instructions - for (; FromIter != ToIter; ++FromIter) { - // This operates under an assumption that we collect all branches in LBR - // No unconditional branches in the middle of the trace - if (BC.MIA->isUnconditionalBranch(FromIter->second) || - BC.MIA->isReturn(FromIter->second) || - BC.MIA->isCall(FromIter->second)) - return NoneType(); - - if (!BC.MIA->isConditionalBranch(FromIter->second)) - continue; 
- - const uint64_t Src = FromIter->first; - auto Next = std::next(FromIter); - const uint64_t Dst = Next->first; - Res.push_back(std::make_pair(Src, Dst)); - } - - return Res; -} - void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, uint64_t OtherStat) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e0157143c732..472890a6e327 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -172,7 +172,7 @@ public: enum class State : char { Empty = 0, /// Function body is empty. Disassembled, /// Function have been disassembled. - CFG, /// Control flow graph have been built. + CFG, /// Control flow graph has been built. CFG_Finalized, /// CFG is finalized. No optimizations allowed. Emitted, /// Instructions have been emitted to output. }; @@ -186,16 +186,6 @@ public: ST_ALL, /// Split all functions }; - enum ReorderType : char { - RT_NONE = 0, - RT_EXEC_COUNT, - RT_HFSORT, - RT_HFSORT_PLUS, - RT_PETTIS_HANSEN, - RT_RANDOM, - RT_USER - }; - /// Branch statistics for jump table entries. struct JumpInfo { uint64_t Mispreds{0}; @@ -447,7 +437,6 @@ private: using BranchListType = std::vector>; BranchListType TakenBranches; /// All local taken branches. - BranchListType FTBranches; /// All fall-through branches. BranchListType IgnoredBranches; /// Branches ignored by CFG purposes. /// Map offset in the function to a label. @@ -754,13 +743,8 @@ private: } /// Return instruction at a given offset in the function. Valid before - /// CFG is constructed. - MCInst *getInstructionAtOffset(uint64_t Offset) { - assert(CurrentState == State::Disassembled && - "can only call function in Disassembled state"); - auto II = Instructions.find(Offset); - return (II == Instructions.end()) ? nullptr : &II->second; - } + /// CFG is constructed or while instruction offsets are available in CFG. 
+ MCInst *getInstructionAtOffset(uint64_t Offset); /// Analyze and process indirect branch \p Instruction before it is /// added to Instructions list. @@ -1480,6 +1464,13 @@ public: ProfileMatchRatio == 1.0f; } + /// Mark this function as having a valid profile. + void markProfiled() { + if (ExecutionCount == COUNT_NO_PROFILE) + ExecutionCount = 0; + ProfileMatchRatio = 1.0f; + } + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { assert(!Instructions.empty()); @@ -1809,6 +1800,12 @@ public: /// State::CFG. Returns false if CFG cannot be built. bool buildCFG(); + /// Read any kind of profile information available for the function. + void readProfile(); + + /// Perform post-processing of the CFG. + void postProcessCFG(); + /// Verify that any assumptions we've made about indirect branches were /// correct and also make any necessary changes to unknown indirect branches. /// @@ -2022,9 +2019,41 @@ public: return UnitLineTable; } - /// Scan from - to offsets for conditional jumps + /// Update function execution profile with a recorded trace. + /// A trace is region of code executed between two LBR entries supplied in + /// execution order. + /// + /// Return true if the trace is valid, false otherwise. + bool recordTrace( + const LBREntry &First, + const LBREntry &Second, + uint64_t Count = 1, + SmallVector, 16> *Branches = nullptr); + + /// Update function profile with a taken branch. + /// \p Count could be 0 if verification of the branch is required. + /// + /// Return true if the branch is valid, false otherwise. + bool recordBranch(uint64_t From, uint64_t To, uint64_t Count = 1, + uint64_t Mispreds = 0); + + /// Record external entry into the function. + /// + /// Return true if the entry is valid, false otherwise. + bool recordEntry(uint64_t To, bool Mispred, uint64_t Count = 1); + + /// Record exit from a function via a call or return. + /// + /// Return true if the exit point is valid, false otherwise. 
+ bool recordExit(uint64_t From, bool Mispred, uint64_t Count = 1); + + /// Finalize profile for the function. + void postProcessProfile(); + + /// Return a vector of offsets corresponding to a trace in a function + /// (see recordTrace() above). Optional, 16>> - getFallthroughsInTrace(uint64_t From, uint64_t To) const; + getFallthroughsInTrace(const LBREntry &First, const LBREntry &Second); /// Returns an estimate of the function's hot part after splitting. /// This is a very rough estimate, as with C++ exceptions there are @@ -2181,6 +2210,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +inline raw_ostream &operator<<(raw_ostream &OS, + const LBREntry &LBR) { + OS << "0x" << Twine::utohexstr(LBR.From) + << " -> 0x" << Twine::utohexstr(LBR.To); + return OS; +} + } // namespace bolt diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp new file mode 100644 index 000000000000..66bf634ef6e9 --- /dev/null +++ b/bolt/BinaryFunctionProfile.cpp @@ -0,0 +1,854 @@ +//===--- BinaryFunctionProfile.cpp --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "DataReader.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory AggregatorCategory; +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt Verbosity; +extern cl::opt IndirectCallPromotion; +extern cl::opt JumpTables; + +static cl::opt +CompatMode("prof-compat-mode", + cl::desc("maintain bug-level compatibility with old profile"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow"), + clEnumValEnd), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixFuncCounts("fix-func-counts", + cl::desc("adjust function counts based on basic blocks execution count"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +bool BinaryFunction::recordTrace( + const LBREntry &First, + const LBREntry &Second, + uint64_t Count, + SmallVector, 16> 
*Branches) { + if (!isSimple()) + return false; + + assert(CurrentState == State::CFG && "can only record traces in CFG state"); + + // Offsets of the trace within this function. + const auto From = First.To - getAddress(); + const auto To = Second.From - getAddress(); + + if (From > To) + return false; + + auto *FromBB = getBasicBlockContainingOffset(From); + auto *ToBB = getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) + return false; + + // Fill out information for fall-through edges. The From and To could be + // within the same basic block, e.g. when two call instructions are in the + // same block. In this case we skip the processing. + if (FromBB == ToBB) { + if (opts::CompatMode) + return true; + + // If the previous block ended with a call, the destination of a return + // would be in ToBB basic block. And if the ToBB starts with a control + // transfer instruction, we will have a 0-length trace that we have to + // account for as a fall-through edge. + if (To == ToBB->getOffset()) { + // External entry point. + if (ToBB->isEntryPoint() || ToBB->isLandingPad()) + return true; + + // Check that the origin LBR of a trace starts in another function. + // Otherwise it's an internal branch that was accounted for. + if (containsAddress(First.From)) + return true; + + auto *PrevBB = BasicBlocksLayout[ToBB->getIndex() - 1]; + + // This could be a bad trace. + if (!PrevBB->getSuccessor(ToBB->getLabel())) { + DEBUG(dbgs() << "invalid LBR sequence:\n" + << " " << First << '\n' + << " " << Second << '\n'); + return false; + } + + auto &BI = PrevBB->getBranchInfo(*ToBB); + BI.Count += Count; + if (Branches) { + const auto *Instr = PrevBB->getLastNonPseudoInstr(); + const auto Offset = + BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); + Branches->push_back(std::make_pair(Offset, ToBB->getOffset())); + } + } + + return true; + } + + // Process blocks in the original layout order. 
+ auto *BB = BasicBlocksLayout[FromBB->getIndex()]; + assert(BB == FromBB && "index mismatch"); + while (BB != ToBB) { + auto *NextBB = BasicBlocksLayout[BB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > BB->getOffset()) && "bad layout"); + + // Check for bad LBRs. + if (!BB->getSuccessor(NextBB->getLabel())) { + DEBUG(dbgs() << "no fall-through for the trace:\n" + << " " << First << '\n' + << " " << Second << '\n'); + return false; + } + + // To keep backwards compatibility we skip recording fall-throughs that + // are not a result of a conditional jump. + if (!opts::CompatMode || + (BB->succ_size() == 2 && + BB->getConditionalSuccessor(false) == NextBB)) { + auto &BI = BB->getBranchInfo(*NextBB); + BI.Count += Count; + + if (Branches) { + const auto *Instr = BB->getLastNonPseudoInstr(); + // Note: real offset for conditional jump instruction shouldn't be 0. + const auto Offset = + BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); + if (Offset) { + Branches->push_back(std::make_pair(Offset, NextBB->getOffset())); + } + } + } + + BB = NextBB; + } + + return true; +} + +bool BinaryFunction::recordBranch(uint64_t From, uint64_t To, + uint64_t Count, uint64_t Mispreds) { + auto *FromBB = getBasicBlockContainingOffset(From); + auto *ToBB = getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) { + DEBUG(dbgs() << "failed to get block for recorded branch\n"); + return false; + } + + // Could be bad LBR data. Ignore, or report as a bad profile for backwards + // compatibility. + if (From == To) { + if (!opts::CompatMode) + return true; + auto *Instr = getInstructionAtOffset(0); + if (Instr && BC.MIA->isCall(*Instr)) + return true; + return false; + } + + if (FromBB->succ_size() == 0) { + // Return from a tail call. + return true; + } + + // Very rarely we will see ignored branches. Do a linear check. 
+ for (auto &Branch : IgnoredBranches) { + if (Branch == std::make_pair(static_cast(From), + static_cast(To))) + return true; + } + + if (To != ToBB->getOffset()) { + // "To" could be referring to nop instructions in between 2 basic blocks. + // While building the CFG we make sure these nops are attributed to the + // previous basic block, thus we check if the destination belongs to the + // gap past the last instruction. + const auto *LastInstr = ToBB->getLastNonPseudoInstr(); + if (LastInstr) { + const auto LastInstrOffset = + BC.MIA->getAnnotationWithDefault(*LastInstr, "Offset"); + + // With old .fdata we are getting FT branches for "jcc,jmp" sequences. + if (To == LastInstrOffset && BC.MIA->isUnconditionalBranch(*LastInstr)) { + return true; + } + + if (To <= LastInstrOffset) { + DEBUG(dbgs() << "branch recorded into the middle of the block" << " in " + << *this << " : " << From << " -> " << To << '\n'); + return false; + } + } + + // The real destination is the layout successor of the detected ToBB. + if (ToBB == BasicBlocksLayout.back()) + return false; + auto *NextBB = BasicBlocksLayout[ToBB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > ToBB->getOffset()) && "bad layout"); + ToBB = NextBB; + } + + // If there's no corresponding instruction for 'From', we have probably + // discarded it as a FT from __builtin_unreachable. + auto *FromInstruction = getInstructionAtOffset(From); + if (!FromInstruction) { + DEBUG(dbgs() << "no instruction for offset " << From << " in " + << *this << '\n'); + return false; + } + + if (FromBB == ToBB) { + // Check for a return from a recursive call. + // Otherwise it's a simple loop. + } + + if (!FromBB->getSuccessor(ToBB->getLabel())) { + // Check if this is a recursive call or a return from a recursive call. + if (ToBB->isEntryPoint()) { + // Execution count is already accounted for. 
+      return true;
+    }
+
+    DEBUG(dbgs() << "invalid branch in " << *this << '\n'
+                 << Twine::utohexstr(From) << " -> "
+                 << Twine::utohexstr(To) << '\n');
+    return false;
+  }
+
+  auto &BI = FromBB->getBranchInfo(*ToBB);
+  BI.Count += Count;
+  // Only update mispredicted count if the count was real.
+  if (Count) {
+    BI.MispredictedCount += Mispreds;
+  }
+
+  return true;
+}
+
+bool BinaryFunction::recordEntry(uint64_t To, bool Mispred, uint64_t Count) {
+  if (To > getSize())
+    return false;
+
+  if (!hasProfile())
+    ExecutionCount = 0;
+
+  if (To == 0)
+    ExecutionCount += Count;
+
+  return true;
+}
+
+bool BinaryFunction::recordExit(uint64_t From, bool Mispred, uint64_t Count) {
+  if (!isSimple())
+    return false;
+  assert(From <= getSize() && "wrong From address");
+
+  if (!hasProfile())
+    ExecutionCount = 0;
+
+  return true;
+}
+
+void BinaryFunction::postProcessProfile() {
+  if (!hasValidProfile()) {
+    clearProfile();
+    return;
+  }
+
+  // Check if MCF post-processing was requested.
+  if (opts::DoMCF != MCF_DISABLE) {
+    removeTagsFromProfile();
+    solveMCF(*this, opts::DoMCF);
+    return;
+  }
+
+  // If we are using non-LBR sampling there's nothing left to do.
+  if (!BranchData)
+    return;
+
+  // Bug compatibility with previous version - double accounting for conditional
+  // jump into a fall-through block.
+  if (opts::CompatMode) {
+    for (auto *BB : BasicBlocks) {
+      if (BB->succ_size() == 2 &&
+          BB->getConditionalSuccessor(false) ==
+            BB->getConditionalSuccessor(true)) {
+        auto &TakenBI = *BB->branch_info_begin();
+        auto &FallThroughBI = *BB->branch_info_rbegin();
+        FallThroughBI.Count = TakenBI.Count;
+        FallThroughBI.MispredictedCount = 0;
+      }
+    }
+  }
+
+  // Pre-sort branch data.
+  std::stable_sort(BranchData->Data.begin(), BranchData->Data.end());
+
+  // If we have at least some branch data for the function indicate that it
+  // was executed.
+ if (opts::FixFuncCounts && ExecutionCount == 0) { + ExecutionCount = 1; + } + + // Compute preliminary execution count for each basic block + for (auto *BB : BasicBlocks) { + BB->ExecutionCount = 0; + } + for (auto *BB : BasicBlocks) { + auto SuccBIIter = BB->branch_info_begin(); + for (auto Succ : BB->successors()) { + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); + ++SuccBIIter; + } + } + + // Set entry BBs to zero, we'll update their execution count next with entry + // data (we maintain a separate data structure for branches to function entry + // points) + for (auto *BB : BasicBlocks) { + if (BB->isEntryPoint()) + BB->ExecutionCount = 0; + } + + // Update execution counts of landing pad blocks and entry BBs + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. + for (const auto &I : BranchData->EntryData) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + BB->setExecutionCount(BB->getExecutionCount() + I.Branches); + } + } + + inferFallThroughCounts(); + + // Update profile information for jump tables based on CFG branch data. 
+ for (auto *BB : BasicBlocks) { + const auto *LastInstr = BB->getLastNonPseudoInstr(); + if (!LastInstr) + continue; + const auto JTAddress = BC.MIA->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + auto *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + + uint64_t TotalBranchCount = 0; + for (const auto &BranchInfo : BB->branch_info()) { + TotalBranchCount += BranchInfo.Count; + } + JT->Count += TotalBranchCount; + + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) + continue; + + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + auto Delta = (JTAddress - JT->Address) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + const auto *TargetBB = getBasicBlockForLabel(*EI); + if (TargetBB) { + const auto &BranchInfo = BB->getBranchInfo(*TargetBB); + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Count += BranchInfo.Count; + JT->Counts[Delta].Mispreds += BranchInfo.MispredictedCount; + } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; + } + } +} + +Optional, 16>> +BinaryFunction::getFallthroughsInTrace(const LBREntry &First, + const LBREntry &Second) { + SmallVector, 16> Res; + + if (!recordTrace(First, Second, 1, &Res)) + return NoneType(); + + return Res; +} + +void BinaryFunction::readProfile() { + if (empty()) + return; + + if (!BC.DR.hasLBR()) { + readSampleData(); + return; + } + + // Possibly assign/re-assign branch profile data. 
+ matchProfileData(); + + if (!BranchData) + return; + + uint64_t MismatchedBranches = 0; + for (const auto &BI : BranchData->Data) { + if (BI.From.Name != BI.To.Name) { + continue; + } + + if (!recordBranch(BI.From.Offset, BI.To.Offset, + BI.Branches, BI.Mispreds)) { + DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " + << BI.To.Offset << '\n'); + ++MismatchedBranches; + } + } + + // Special profile data propagation is required for conditional tail calls. + for (auto BB : BasicBlocks) { + auto *CTCInstr = BB->getLastNonPseudoInstr(); + if (!CTCInstr || !BC.MIA->getConditionalTailCall(*CTCInstr)) + continue; + + auto OffsetOrErr = + BC.MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); + assert(OffsetOrErr && "offset not set for conditional tail call"); + + auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); + if (!BranchInfoOrErr) + continue; + + BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", + BranchInfoOrErr->Branches); + BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", + BranchInfoOrErr->Mispreds); + } +} + +void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { + // No reason to merge invalid or empty profiles into BF. + if (!hasValidProfile()) + return; + + // Update function execution count. + if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { + BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); + } + + // Since we are merging a valid profile, the new profile should be valid too. + // It has either already been valid, or it has been cleaned up. + BF.ProfileMatchRatio = 1.0f; + + // Update basic block and edge counts. + auto BBMergeI = BF.begin(); + for (BinaryBasicBlock *BB : BasicBlocks) { + BinaryBasicBlock *BBMerge = &*BBMergeI; + assert(getIndex(BB) == BF.getIndex(BBMerge)); + + // Update basic block count. 
+ if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { + BBMerge->setExecutionCount( + BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); + } + + // Update edge count for successors of this basic block. + auto BBMergeSI = BBMerge->succ_begin(); + auto BIMergeI = BBMerge->branch_info_begin(); + auto BII = BB->branch_info_begin(); + for (const auto *BBSucc : BB->successors()) { + (void)BBSucc; + assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); + + // At this point no branch count should be set to COUNT_NO_PROFILE. + assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + + BIMergeI->Count += BII->Count; + + // When we merge inferred and real fall-through branch data, the merged + // data is considered inferred. + if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && + BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { + BIMergeI->MispredictedCount += BII->MispredictedCount; + } else { + BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + } + + ++BBMergeSI; + ++BII; + ++BIMergeI; + } + assert(BBMergeSI == BBMerge->succ_end()); + + ++BBMergeI; + } + assert(BBMergeI == BF.end()); +} + +void BinaryFunction::readSampleData() { + auto SampleDataOrErr = BC.DR.getFuncSampleData(getNames()); + + if (!SampleDataOrErr) + return; + + // Non-LBR mode territory + // First step is to assign BB execution count based on samples from perf + ProfileMatchRatio = 1.0f; + removeTagsFromProfile(); + bool NormalizeByInsnCount = + BC.DR.usesEvent("cycles") || BC.DR.usesEvent("instructions"); + bool NormalizeByCalls = BC.DR.usesEvent("branches"); + static bool NagUser{true}; + if (NagUser) { + outs() << "BOLT-INFO: operating with non-LBR profiling data.\n"; + if (NormalizeByInsnCount) { + outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; + } else if 
(NormalizeByCalls) { + outs() << "BOLT-INFO: normalizing samples by branches.\n"; + } + NagUser = false; + } + uint64_t LastOffset = getSize(); + uint64_t TotalEntryCount{0}; + for (auto I = BasicBlockOffsets.rbegin(), E = BasicBlockOffsets.rend(); + I != E; ++I) { + uint64_t CurOffset = I->first; + // Always work with samples multiplied by 1000 to avoid losing them if we + // later need to normalize numbers + uint64_t NumSamples = + SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; + if (NormalizeByInsnCount && I->second->getNumNonPseudos()) + NumSamples /= I->second->getNumNonPseudos(); + else if (NormalizeByCalls) { + uint32_t NumCalls = I->second->getNumCalls(); + NumSamples /= NumCalls + 1; + } + I->second->setExecutionCount(NumSamples); + if (I->second->isEntryPoint()) + TotalEntryCount += NumSamples; + LastOffset = CurOffset; + } + ExecutionCount = TotalEntryCount; + + estimateEdgeCounts(BC, *this); + + if (opts::DoMCF != MCF_DISABLE) + solveMCF(*this, opts::DoMCF); +} + +void BinaryFunction::inferFallThroughCounts() { + // Work on a basic block at a time, propagating frequency information + // forwards. + // It is important to walk in the layout order. + for (auto *BB : BasicBlocks) { + const uint64_t BBExecCount = BB->getExecutionCount(); + + // Propagate this information to successors, filling in fall-through edges + // with frequency information + if (BB->succ_size() == 0) + continue; + + // Calculate frequency of outgoing branches from this node according to + // LBR data. + uint64_t ReportedBranches = 0; + for (const auto &SuccBI : BB->branch_info()) { + if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + ReportedBranches += SuccBI.Count; + } + + // Get taken count of conditional tail call if the block ends with one. 
+ uint64_t CTCTakenCount = 0; + const auto CTCInstr = BB->getLastNonPseudoInstr(); + if (CTCInstr && BC.MIA->getConditionalTailCall(*CTCInstr)) { + CTCTakenCount = + BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); + } + + // Calculate frequency of throws from this node according to LBR data + // for branching into associated landing pads. Since it is possible + // for a landing pad to be associated with more than one basic blocks, + // we may overestimate the frequency of throws for such blocks. + uint64_t ReportedThrows = 0; + for (const auto *LP: BB->landing_pads()) { + ReportedThrows += LP->getExecutionCount(); + } + + const uint64_t TotalReportedJumps = + ReportedBranches + CTCTakenCount + ReportedThrows; + + // Infer the frequency of the fall-through edge, representing not taking the + // branch. + uint64_t Inferred = 0; + if (BBExecCount > TotalReportedJumps) + Inferred = BBExecCount - TotalReportedJumps; + + DEBUG( + if (BBExecCount < TotalReportedJumps) + dbgs() + << "Fall-through inference is slightly inconsistent. " + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; + ); + + if (BB->succ_size() <= 2) { + // Skip if the last instruction is an unconditional jump. + const auto *LastInstr = BB->getLastNonPseudoInstr(); + if (LastInstr && + (BC.MIA->isUnconditionalBranch(*LastInstr) || + BC.MIA->isIndirectBranch(*LastInstr))) + continue; + // If there is an FT it will be the last successor. 
+ auto &SuccBI = *BB->branch_info_rbegin(); + auto &Succ = *BB->succ_rbegin(); + if (SuccBI.Count == 0) { + SuccBI.Count = Inferred; + SuccBI.MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + Succ->ExecutionCount += Inferred; + } + } + } + + return; +} + +bool BinaryFunction::fetchProfileForOtherEntryPoints() { + if (!BranchData) + return false; + + // Check if we are missing profiling data for secondary entry points + bool First{true}; + bool Updated{false}; + for (auto BB : BasicBlocks) { + if (First) { + First = false; + continue; + } + if (BB->isEntryPoint()) { + uint64_t EntryAddress = BB->getOffset() + getAddress(); + // Look for branch data associated with this entry point + std::vector Names; + std::multimap::iterator I, E; + for (std::tie(I, E) = BC.GlobalAddresses.equal_range(EntryAddress); + I != E; ++I) { + Names.push_back(I->second); + } + if (!Names.empty()) { + if (FuncBranchData *Data = BC.DR.getFuncBranchData(Names)) { + BranchData->appendFrom(*Data, BB->getOffset()); + Data->Used = true; + Updated = true; + } + } + } + } + return Updated; +} + +void BinaryFunction::matchProfileMemData() { + const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); + for (auto *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (MemData) + MemData->Used = false; + + // Update function profile data with the new set. 
+ MemData = NewMemData; + MemData->Used = true; + break; + } +} + +void BinaryFunction::matchProfileData() { + // This functionality is available for LBR-mode only + // TODO: Implement evaluateProfileData() for samples, checking whether + // sample addresses match instruction addresses in the function + if (!BC.DR.hasLBR()) + return; + + if (BranchData) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + if (ProfileMatchRatio == 1.0f) { + if (fetchProfileForOtherEntryPoints()) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + ExecutionCount = BranchData->ExecutionCount; + } + return; + } + } + + // Check if the function name can fluctuate between several compilations + // possibly triggered by minor unrelated code changes in the source code + // of the input binary. + const auto HasVolatileName = [this]() { + for (const auto Name : getNames()) { + if (getLTOCommonName(Name)) + return true; + } + return false; + }(); + if (!HasVolatileName) + return; + + // Check for a profile that matches with 100% confidence. + const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); + for (auto *NewBranchData : AllBranchData) { + // Prevent functions from sharing the same profile. + if (NewBranchData->Used) + continue; + + if (evaluateProfileData(*NewBranchData) != 1.0f) + continue; + + if (BranchData) + BranchData->Used = false; + + // Update function profile data with the new set. + BranchData = NewBranchData; + ExecutionCount = NewBranchData->ExecutionCount; + ProfileMatchRatio = 1.0f; + BranchData->Used = true; + break; + } +} + +float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { + // Until we define a minimal profile, we consider an empty branch data to be + // a valid profile. It could happen to a function without branches when we + // still have an EntryData for execution count. 
+ if (BranchData.Data.empty()) { + return 1.0f; + } + + uint64_t NumMatchedBranches = 0; + for (const auto &BI : BranchData.Data) { + bool IsValid = false; + if (BI.From.Name == BI.To.Name) { + // Try to record information with 0 count. + IsValid = recordBranch(BI.From.Offset, BI.To.Offset, 0); + } else { + // The branch has to originate from this function. + // Check for calls, tail calls, rets and indirect branches. + // When matching profiling info, we did not reach the stage + // when we identify tail calls, so they are still represented + // by regular branch instructions and we need isBranch() here. + auto *Instr = getInstructionAtOffset(BI.From.Offset); + // If it's a prefix - skip it. + if (Instr && BC.MIA->isPrefix(*Instr)) + Instr = getInstructionAtOffset(BI.From.Offset + 1); + if (Instr && + (BC.MIA->isCall(*Instr) || + BC.MIA->isBranch(*Instr) || + BC.MIA->isReturn(*Instr))) { + IsValid = true; + } + } + + if (IsValid) { + ++NumMatchedBranches; + continue; + } + + DEBUG(dbgs() + << "\tinvalid branch in " << *this << " : 0x" + << Twine::utohexstr(BI.From.Offset) << " -> "; + if (BI.From.Name == BI.To.Name) + dbgs() << "0x" << Twine::utohexstr(BI.To.Offset) << '\n'; + else + dbgs() << "\n"; + ); + } + + const auto MatchRatio = (float) NumMatchedBranches / BranchData.Data.size(); + if (opts::Verbosity >= 2 && NumMatchedBranches < BranchData.Data.size()) { + errs() << "BOLT-WARNING: profile branches match only " + << format("%.1f%%", MatchRatio * 100.0f) << " (" + << NumMatchedBranches << '/' << BranchData.Data.size() + << ") for function " << *this << '\n'; + } + + return MatchRatio; +} + +void BinaryFunction::clearProfile() { + // Keep function execution profile the same. Only clear basic block and edge + // counts. 
+ for (auto *BB : BasicBlocks) { + BB->ExecutionCount = 0; + for (auto &BI : BB->branch_info()) { + BI.Count = 0; + BI.MispredictedCount = 0; + } + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index facc0b5ddee0..687c10497765 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -345,7 +345,7 @@ void BinaryFunctionPassManager::runAllPasses( // order they're registered. // Run this pass first to use stats for the original functions. - Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(NeverPrint)); Manager.registerPass(llvm::make_unique(NeverPrint), opts::StripRepRet); diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index dd8a44975134..959b19915f10 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -63,6 +63,7 @@ add_llvm_tool(llvm-bolt BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp + BinaryFunctionProfile.cpp BinaryPassManager.cpp CacheMetrics.cpp DataAggregator.cpp diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index dbce1e4ec465..a964c73069e4 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -414,6 +414,14 @@ bool DataAggregator::aggregate(BinaryContext &BC, outs() << "PERF2BOLT: Failed to parse branch events\n"; } + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : BFs) { + auto &BF = BFI.second; + if (BF.getBranchData()) { + BF.markProfiled(); + } + } + auto PI3 = sys::Wait(MemEventsPI, 0, true, &Error); if (PI3.ReturnCode != 0) { @@ -423,7 +431,8 @@ bool DataAggregator::aggregate(BinaryContext &BC, deleteTempFiles(); - Regex NoData("Samples for '.*' event do not have ADDR attribute set. Cannot print 'addr' field."); + Regex NoData("Samples for '.*' event do not have ADDR attribute set. 
" + "Cannot print 'addr' field."); if (!NoData.match(ErrBuf)) { errs() << "PERF-ERROR: Return code " << PI3.ReturnCode << "\n"; errs() << ErrBuf; @@ -450,7 +459,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, } deleteTempFiles(); - + return true; } @@ -467,8 +476,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { return &FI->second; } -bool DataAggregator::doIntraBranch(BinaryFunction *Func, uint64_t From, - uint64_t To, bool Mispred) { +bool +DataAggregator::doIntraBranch(BinaryFunction *Func, const LBREntry &Branch) { FuncBranchData *AggrData = Func->getBranchData(); if (!AggrData) { AggrData = &FuncsToBranches[Func->getNames()[0]]; @@ -476,19 +485,21 @@ bool DataAggregator::doIntraBranch(BinaryFunction *Func, uint64_t From, Func->setBranchData(AggrData); } - From -= Func->getAddress(); - To -= Func->getAddress(); - AggrData->bumpBranchCount(From, To, Mispred); + AggrData->bumpBranchCount(Branch.From - Func->getAddress(), + Branch.To - Func->getAddress(), + Branch.Mispred); return true; } bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, - BinaryFunction *ToFunc, uint64_t From, - uint64_t To, bool Mispred) { + BinaryFunction *ToFunc, + const LBREntry &Branch) { FuncBranchData *FromAggrData{nullptr}; FuncBranchData *ToAggrData{nullptr}; StringRef SrcFunc; StringRef DstFunc; + auto From = Branch.From; + auto To = Branch.To; if (FromFunc) { SrcFunc = FromFunc->getNames()[0]; FromAggrData = FromFunc->getBranchData(); @@ -498,6 +509,8 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, FromFunc->setBranchData(FromAggrData); } From -= FromFunc->getAddress(); + + FromFunc->recordExit(From, Branch.Mispred); } if (ToFunc) { DstFunc = ToFunc->getNames()[0]; @@ -508,32 +521,39 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, ToFunc->setBranchData(ToAggrData); } To -= ToFunc->getAddress(); + + ToFunc->recordEntry(To, Branch.Mispred); } if (FromAggrData) FromAggrData->bumpCallCount(From, 
Location(!DstFunc.empty(), DstFunc, To), - Mispred); + Branch.Mispred); if (ToAggrData) ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, - Mispred); + Branch.Mispred); return true; } -bool DataAggregator::doBranch(uint64_t From, uint64_t To, bool Mispred) { - auto *FromFunc = getBinaryFunctionContainingAddress(From); - auto *ToFunc = getBinaryFunctionContainingAddress(To); +bool DataAggregator::doBranch(const LBREntry &Branch) { + auto *FromFunc = getBinaryFunctionContainingAddress(Branch.From); + auto *ToFunc = getBinaryFunctionContainingAddress(Branch.To); if (!FromFunc && !ToFunc) return false; - if (FromFunc == ToFunc) - return doIntraBranch(FromFunc, From, To, Mispred); + if (FromFunc == ToFunc) { + FromFunc->recordBranch(Branch.From - FromFunc->getAddress(), + Branch.To - FromFunc->getAddress(), + 1, + Branch.Mispred); + return doIntraBranch(FromFunc, Branch); + } - return doInterBranch(FromFunc, ToFunc, From, To, Mispred); + return doInterBranch(FromFunc, ToFunc, Branch); } -bool DataAggregator::doTrace(uint64_t From, uint64_t To) { - auto *FromFunc = getBinaryFunctionContainingAddress(From); - auto *ToFunc = getBinaryFunctionContainingAddress(To); +bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second) { + auto *FromFunc = getBinaryFunctionContainingAddress(First.To); + auto *ToFunc = getBinaryFunctionContainingAddress(Second.From); if (!FromFunc || !ToFunc) { ++NumLongRangeTraces; return false; @@ -541,26 +561,25 @@ bool DataAggregator::doTrace(uint64_t From, uint64_t To) { if (FromFunc != ToFunc) { ++NumInvalidTraces; DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ " - << Twine::utohexstr(From - FromFunc->getAddress()) + << Twine::utohexstr(First.To - FromFunc->getAddress()) << " and ending in " << ToFunc->getPrintName() << " @ " << ToFunc->getPrintName() << " @ " - << Twine::utohexstr(To - ToFunc->getAddress()) << "\n"); + << Twine::utohexstr(Second.From - ToFunc->getAddress()) + << 
'\n'); return false; } - if (FromFunc) { - From -= FromFunc->getAddress(); - To -= ToFunc->getAddress(); - } - auto FTs = FromFunc->getFallthroughsInTrace(From, To); + auto FTs = FromFunc->getFallthroughsInTrace(First, Second); if (!FTs) { ++NumInvalidTraces; return false; } for (const auto &Pair : *FTs) { - doIntraBranch(FromFunc, Pair.first + FromFunc->getAddress(), - Pair.second + FromFunc->getAddress(), false); + doIntraBranch(FromFunc, + LBREntry{Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), + false}); } return true; @@ -710,7 +729,8 @@ bool DataAggregator::hasData() { std::error_code DataAggregator::parseBranchEvents() { outs() << "PERF2BOLT: Aggregating branch events...\n"; - NamedRegionTimer T("Branch samples parsing", TimerGroupName, opts::TimeAggregator); + NamedRegionTimer T("Branch samples parsing", TimerGroupName, + opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; @@ -727,14 +747,16 @@ std::error_code DataAggregator::parseBranchEvents() { NumEntries += Sample.LBR.size(); // Parser semantic actions - uint64_t Last{0}; + // LBRs are stored in reverse execution order. NextLBR refers to next + // executed branch record. 
+ const LBREntry *NextLBR{nullptr}; for (const auto &LBR : Sample.LBR) { - if (Last) { - doTrace(LBR.To, Last); + if (NextLBR) { + doTrace(LBR, *NextLBR); ++NumTraces; } - doBranch(LBR.From, LBR.To, LBR.Mispred); - Last = LBR.From; + doBranch(LBR); + NextLBR = &LBR; } } outs() << "PERF2BOLT: Read " << NumSamples << " samples and " diff --git a/bolt/DataAggregator.h b/bolt/DataAggregator.h index 6dcac3f7daed..7c1b575be664 100644 --- a/bolt/DataAggregator.h +++ b/bolt/DataAggregator.h @@ -28,12 +28,6 @@ namespace bolt { class BinaryFunction; class BinaryContext; -struct LBREntry { - uint64_t From; - uint64_t To; - bool Mispred; -}; - struct PerfBranchSample { SmallVector LBR; }; @@ -125,24 +119,19 @@ class DataAggregator : public DataReader { BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address); /// Semantic actions - parser hooks to interpret parsed perf samples - /// Register an intraprocedural branch in \p Func with offsets \p From and - /// \p To (relative to \p Func start address). - bool doIntraBranch(BinaryFunction *Func, uint64_t From, uint64_t To, - bool Mispred); + /// Register an intraprocedural branch \p Branch. + bool doIntraBranch(BinaryFunction *Func, const LBREntry &Branch); /// Register an interprocedural branch from \p FromFunc to \p ToFunc with /// offsets \p From and \p To, respectively. bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, - uint64_t From, uint64_t To, bool Mispred); + const LBREntry &Branch); - /// Register a branch with raw addresses \p From and \p To extracted from the - /// LBR - bool doBranch(uint64_t From, uint64_t To, bool Mispred); + /// Register a \p Branch. + bool doBranch(const LBREntry &Branch); - /// Register a trace starting in raw address \p From and ending in \p To - /// This will add all intermediate conditional branches in this trace as not - /// taken. - bool doTrace(uint64_t From, uint64_t To); + /// Register a trace between two LBR entries supplied in execution order. 
+ bool doTrace(const LBREntry &First, const LBREntry &Second); /// Parser helpers /// Return false if we exhausted our parser buffer and finished parsing diff --git a/bolt/DataReader.h b/bolt/DataReader.h index b3ba0999a932..474b1aa3b304 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -31,6 +31,12 @@ namespace llvm { namespace bolt { +struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; +}; + /// LTO-generated function names take a form: /// /// .lto_priv./... diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 3e98e6300831..ddcd87974dac 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -48,6 +48,7 @@ const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { namespace opts { +extern cl::OptionCategory BoltCategory; extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; @@ -88,6 +89,33 @@ MinBranchClusters("min-branch-clusters", cl::Hidden, cl::cat(BoltOptCategory)); +enum PeepholeOpts : char { + PEEP_NONE = 0x0, + PEEP_SHORTEN = 0x1, + PEEP_DOUBLE_JUMPS = 0x2, + PEEP_TAILCALL_TRAPS = 0x4, + PEEP_USELESS_BRANCHES = 0x8, + PEEP_ALL = 0xf +}; + +static cl::list +Peepholes("peepholes", + cl::CommaSeparated, + cl::desc("enable peephole optimizations"), + cl::value_desc("opt1,opt2,opt3,..."), + cl::values( + clEnumValN(PEEP_NONE, "none", "disable peepholes"), + clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), + clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", + "remove double jumps when able"), + clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), + clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", + "remove useless conditional branches"), + clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt PrintFuncStat("print-function-statistics", cl::desc("print statistics about basic block ordering"), @@ -140,6 +168,14 @@ 
ReorderBlocks("reorder-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +ReportStaleFuncs("report-stale", + cl::desc("print the list of functions with stale profile"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + enum SctcModes : char { SctcAlways, SctcPreserveDirection, @@ -178,32 +214,14 @@ TSPThreshold("tsp-threshold", cl::Hidden, cl::cat(BoltOptCategory)); -enum PeepholeOpts : char { - PEEP_NONE = 0x0, - PEEP_SHORTEN = 0x1, - PEEP_DOUBLE_JUMPS = 0x2, - PEEP_TAILCALL_TRAPS = 0x4, - PEEP_USELESS_BRANCHES = 0x8, - PEEP_ALL = 0xf -}; - -static cl::list -Peepholes("peepholes", - cl::CommaSeparated, - cl::desc("enable peephole optimizations"), - cl::value_desc("opt1,opt2,opt3,..."), - cl::values( - clEnumValN(PEEP_NONE, "none", "disable peepholes"), - clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), - clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", - "remove double jumps when able"), - clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), - clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", - "remove useless conditional branches"), - clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), - clEnumValEnd), +static cl::opt +TopCalledLimit("top-called-limit", + cl::desc("maximum number of functions to print in top called " + "functions section"), + cl::init(100), cl::ZeroOrMore, - cl::cat(BoltOptCategory)); + cl::Hidden, + cl::cat(BoltCategory)); } // namespace opts @@ -861,6 +879,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, MIA->setConditionalTailCall(*CondBranch); // Add info abount the conditional tail call frequency, otherwise this // info will be lost when we delete the associated BranchInfo entry + BC.MIA->removeAnnotation(*CondBranch, "CTCTakenCount"); BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenCount", CTCTakenFreq); @@ -1315,11 +1334,93 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, } } 
-void PrintSortedBy::runOnFunctions( - BinaryContext &, - std::map &BFs, - std::set & -) { +void +PrintProgramStats::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + uint64_t NumSimpleFunctions{0}; + uint64_t NumStaleProfileFunctions{0}; + std::vector ProfiledFunctions; + const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; + for (auto &BFI : BFs) { + auto &Function = BFI.second; + if (!Function.isSimple()) + continue; + ++NumSimpleFunctions; + if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + continue; + if (Function.hasValidProfile()) + ProfiledFunctions.push_back(&Function); + else { + if (opts::ReportStaleFuncs) { + outs() << StaleFuncsHeader; + StaleFuncsHeader = ""; + outs() << " " << Function << '\n'; + } + ++NumStaleProfileFunctions; + } + } + BC.NumProfiledFuncs = ProfiledFunctions.size(); + + const auto NumAllProfiledFunctions = + ProfiledFunctions.size() + NumStaleProfileFunctions; + outs() << "BOLT-INFO: " + << NumAllProfiledFunctions + << " functions out of " << NumSimpleFunctions << " simple functions (" + << format("%.1f", NumAllProfiledFunctions / + (float) NumSimpleFunctions * 100.0f) + << "%) have non-empty execution profile.\n"; + if (NumStaleProfileFunctions) { + outs() << "BOLT-INFO: " << NumStaleProfileFunctions + << format(" (%.1f%% of all profiled)", + NumStaleProfileFunctions / + (float) NumAllProfiledFunctions * 100.0f) + << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") + << " have invalid (possibly stale) profile.\n"; + } + + // Profile is marked as 'Used' if it either matches a function name + // exactly or if it 100% matches any of functions with matching common + // LTO names. 
+ auto getUnusedObjects = [&]() -> Optional> { + std::vector UnusedObjects; + for (const auto &Func : BC.DR.getAllFuncsData()) { + if (!Func.getValue().Used) { + UnusedObjects.emplace_back(Func.getKey()); + } + } + if (UnusedObjects.empty()) + return NoneType(); + return UnusedObjects; + }; + + if (const auto UnusedObjects = getUnusedObjects()) { + outs() << "BOLT-INFO: profile for " << UnusedObjects->size() + << " objects was ignored\n"; + if (opts::Verbosity >= 1) { + for (auto Name : *UnusedObjects) { + outs() << " " << Name << '\n'; + } + } + } + + if (ProfiledFunctions.size() > 10) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: top called functions are:\n"; + std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), + [](BinaryFunction *A, BinaryFunction *B) { + return B->getExecutionCount() < A->getExecutionCount(); + } + ); + auto SFI = ProfiledFunctions.begin(); + auto SFIend = ProfiledFunctions.end(); + for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { + outs() << " " << **SFI << " : " + << (*SFI)->getExecutionCount() << '\n'; + } + } + } + if (!opts::PrintSortedBy.empty() && std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 3316afd2c808..ea7376f7997b 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -382,17 +382,15 @@ public: std::set &LargeFunctions) override; }; -/// /// Prints a list of the top 100 functions sorted by a set of /// dyno stats categories. 
-/// -class PrintSortedBy : public BinaryFunctionPass { +class PrintProgramStats : public BinaryFunctionPass { public: - explicit PrintSortedBy(const cl::opt &PrintPass) + explicit PrintProgramStats(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } const char *getName() const override { - return "print-sorted-by"; + return "print-stats"; } bool shouldPrint(const BinaryFunction &) const override { return false; diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index fc0792c7c760..f76564355c7b 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -1295,7 +1295,8 @@ void IndirectCallPromotion::runOnFunctions( if (BC.MIA->isCall(Inst) && BC.MIA->getTargetSymbol(Inst, 0)) continue; - assert(BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)); + assert((BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)) + && "expected a call or an indirect jump instruction"); if (IsJumpTable) ++TotalJumpTableCallsites; diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index f7d4ffa55dd1..3535588773d5 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -27,29 +27,29 @@ extern cl::opt RandomSeed; extern bool shouldProcess(const bolt::BinaryFunction &Function); extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt +cl::opt ReorderFunctions("reorder-functions", cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::BinaryFunction::RT_NONE), - cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", "do not reorder functions"), - clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", "order by execution count"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT, + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, 
"hfsort", "use hfsort algorithm"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", "use hfsort+ algorithm"), - clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, "pettis-hansen", "use Pettis-Hansen algorithm"), - clEnumValN(bolt::BinaryFunction::RT_RANDOM, + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", "reorder functions randomly"), - clEnumValN(bolt::BinaryFunction::RT_USER, + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", "use function order specified by -function-order"), clEnumValEnd), @@ -142,7 +142,7 @@ void ReorderFunctions::reorder(std::vector &&Clusters, } } - if (opts::ReorderFunctions == BinaryFunction::RT_NONE) + if (opts::ReorderFunctions == RT_NONE) return; if (opts::Verbosity == 0) { @@ -280,15 +280,15 @@ std::vector readFunctionOrderFile() { void ReorderFunctions::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { - if (!BC.HasRelocations && opts::ReorderFunctions != BinaryFunction::RT_NONE) { + if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) { errs() << "BOLT-ERROR: Function reordering only works when " << "relocs are enabled.\n"; exit(1); } - if (opts::ReorderFunctions != BinaryFunction::RT_NONE && - opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT && - opts::ReorderFunctions != BinaryFunction::RT_USER) { + if (opts::ReorderFunctions != RT_NONE && + opts::ReorderFunctions != RT_EXEC_COUNT && + opts::ReorderFunctions != RT_USER) { Cg = buildCallGraph(BC, BFs, [this](const BinaryFunction &BF) { @@ -306,9 +306,9 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, std::vector Clusters; switch(opts::ReorderFunctions) { - case BinaryFunction::RT_NONE: + case RT_NONE: break; - case BinaryFunction::RT_EXEC_COUNT: + case RT_EXEC_COUNT: { std::vector SortedFunctions(BFs.size()); uint32_t Index = 0; @@ -340,20 +340,20 @@ void 
ReorderFunctions::runOnFunctions(BinaryContext &BC, } } break; - case BinaryFunction::RT_HFSORT: + case RT_HFSORT: Clusters = clusterize(Cg); break; - case BinaryFunction::RT_HFSORT_PLUS: + case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg, opts::UseGainCache); break; - case BinaryFunction::RT_PETTIS_HANSEN: + case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; - case BinaryFunction::RT_RANDOM: + case RT_RANDOM: std::srand(opts::RandomSeed); Clusters = randomClusters(Cg); break; - case BinaryFunction::RT_USER: + case RT_USER: { uint32_t Index = 0; for (const auto &Function : readFunctionOrderFile()) { @@ -394,7 +394,8 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, if (!BF->hasValidIndex()) { BF->setIndex(Index++); } else if (opts::Verbosity > 0) { - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n"; + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function + << ".\n"; } } } diff --git a/bolt/Passes/ReorderFunctions.h b/bolt/Passes/ReorderFunctions.h index 57f804ae2290..1b1c58021dfd 100644 --- a/bolt/Passes/ReorderFunctions.h +++ b/bolt/Passes/ReorderFunctions.h @@ -24,7 +24,17 @@ class ReorderFunctions : public BinaryFunctionPass { void reorder(std::vector &&Clusters, std::map &BFs); - public: +public: + enum ReorderType : char { + RT_NONE = 0, + RT_EXEC_COUNT, + RT_HFSORT, + RT_HFSORT_PLUS, + RT_PETTIS_HANSEN, + RT_RANDOM, + RT_USER + }; + explicit ReorderFunctions(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 733a52a2eb40..22ffe9f273c0 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -217,14 +217,6 @@ RelocationMode("relocs", cl::ZeroOrMore, cl::cat(BoltCategory)); -static cl::opt -ReportStaleFuncs("report-stale", - cl::desc("print a list of functions with a stale profile"), - cl::init(false), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltCategory)); - static cl::list SkipFunctionNames("skip-funcs", 
cl::CommaSeparated, @@ -255,15 +247,6 @@ SplitFunctions("split-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -TopCalledLimit("top-called-limit", - cl::desc("maximum number of functions to print in top called " - "functions section"), - cl::init(100), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltCategory)); - cl::opt TrapOldCode("trap-old-code", cl::desc("insert traps in old function bodies (relocation mode)"), @@ -572,7 +555,8 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, std::unique_ptr MII(TheTarget->createMCInstrInfo()); if (!MII) { - errs() << "BOLT-ERROR: no instruction info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no instruction info for target " << TripleName + << "\n"; return nullptr; } @@ -666,19 +650,6 @@ void RewriteInstance::reset() { FailedAddresses.clear(); RangesSectionsWriter.reset(); LocationListWriter.reset(); - TotalScore = 0; -} - -void RewriteInstance::aggregateData() { - NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); - DA.aggregate(*BC.get(), BinaryFunctions); - - if (!opts::AggregateOnly) - return; - - if (std::error_code EC = DA.writeAggregatedFile()) { - check_error(EC, "cannot create output data file"); - } } void RewriteInstance::discoverStorage() { @@ -901,13 +872,11 @@ void RewriteInstance::run() { readSpecialSections(); discoverFileObjects(); readDebugInfo(); - readProfileData(); disassembleFunctions(); - if (DA.started()) { - aggregateData(); - if (opts::AggregateOnly) - return; - } + readProfileData(); + if (opts::AggregateOnly) + return; + postProcessFunctions(); for (uint64_t Address : NonSimpleFunctions) { auto FI = BinaryFunctions.find(Address); assert(FI != BinaryFunctions.end() && "bad non-simple function address"); @@ -1930,30 +1899,44 @@ void RewriteInstance::readDebugInfo() { } void RewriteInstance::readProfileData() { - NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); - if (BC->DR.getAllFuncsData().empty()) + 
if (DA.started()) { + NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); + DA.aggregate(*BC.get(), BinaryFunctions); + + if (opts::AggregateOnly) { + if (std::error_code EC = DA.writeAggregatedFile()) { + check_error(EC, "cannot create output data file"); + } + } return; + } + + NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); + // Preliminary match profile data to functions. + if (!BC->DR.getAllFuncsData().empty()) { + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { + Function.MemData = MemData; + MemData->Used = true; + } + if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } + } + } for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { - Function.MemData = MemData; - MemData->Used = true; - } - if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { - Function.BranchData = FuncData; - Function.ExecutionCount = FuncData->ExecutionCount; - FuncData->Used = true; - } + Function.readProfile(); } } void RewriteInstance::disassembleFunctions() { NamedRegionTimer T("disassemble functions", TimerGroupName, opts::TimeRewrite); - // Disassemble every function and build it's control flow graph. - TotalScore = 0; - BC->SumExecutionCount = 0; for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; @@ -1965,7 +1948,6 @@ void RewriteInstance::disassembleFunctions() { } auto FunctionData = BC->getFunctionData(Function); - if (!FunctionData) { // When could it happen? errs() << "BOLT-ERROR: corresponding section is non-executable or " @@ -1980,7 +1962,7 @@ void RewriteInstance::disassembleFunctions() { } // Offset of the function in the file. 
- auto *FileBegin = + const auto *FileBegin = reinterpret_cast(InputFile->getData().data()); Function.setFileOffset(FunctionData->begin() - FileBegin); @@ -2049,9 +2031,6 @@ void RewriteInstance::disassembleFunctions() { } BC->InterproceduralReferences.clear(); - if (opts::AggregateOnly) - continue; - // Fill in CFI information for this function if (Function.isSimple()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { @@ -2071,6 +2050,23 @@ void RewriteInstance::disassembleFunctions() { if (!Function.buildCFG()) continue; + if (opts::PrintAll) + Function.print(outs(), "while building cfg", true); + + } // Iterate over all functions +} + +void RewriteInstance::postProcessFunctions() { + BC->TotalScore = 0; + BC->SumExecutionCount = 0; + for (auto &BFI : BinaryFunctions) { + BinaryFunction &Function = BFI.second; + + if (Function.empty()) + continue; + + Function.postProcessCFG(); + if (opts::PrintAll || opts::PrintCFG) Function.print(outs(), "after building cfg", true); @@ -2082,95 +2078,8 @@ void RewriteInstance::disassembleFunctions() { Function.printLoopInfo(outs()); } - TotalScore += Function.getFunctionScore(); + BC->TotalScore += Function.getFunctionScore(); BC->SumExecutionCount += Function.getKnownExecutionCount(); - - } // Iterate over all functions - - if (opts::AggregateOnly) - return; - - const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; - uint64_t NumSimpleFunctions{0}; - uint64_t NumStaleProfileFunctions{0}; - std::vector ProfiledFunctions; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - if (!Function.isSimple()) - continue; - ++NumSimpleFunctions; - if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) - continue; - if (Function.hasValidProfile()) { - ProfiledFunctions.push_back(&Function); - } else { - if (opts::ReportStaleFuncs) { - outs() << StaleFuncsHeader - << " " << Function << '\n'; - StaleFuncsHeader = ""; - } - ++NumStaleProfileFunctions; - } - } - BC->NumProfiledFuncs = 
ProfiledFunctions.size(); - - const auto NumAllProfiledFunctions = - ProfiledFunctions.size() + NumStaleProfileFunctions; - outs() << "BOLT-INFO: " - << NumAllProfiledFunctions - << " functions out of " << NumSimpleFunctions << " simple functions (" - << format("%.1f", NumAllProfiledFunctions / - (float) NumSimpleFunctions * 100.0f) - << "%) have non-empty execution profile.\n"; - if (NumStaleProfileFunctions) { - outs() << "BOLT-INFO: " << NumStaleProfileFunctions - << format(" (%.1f%% of all profiled)", - NumStaleProfileFunctions / - (float) NumAllProfiledFunctions * 100.0f) - << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") - << " have invalid (possibly stale) profile.\n"; - } - - // Profile is marked as 'Used' if it either matches a function name - // exactly or if it 100% matches any of functions with matching common - // LTO names. - auto getUnusedObjects = [this]() -> Optional> { - std::vector UnusedObjects; - for (const auto &Func : BC->DR.getAllFuncsData()) { - if (!Func.getValue().Used) { - UnusedObjects.emplace_back(Func.getKey()); - } - } - if (UnusedObjects.empty()) - return NoneType(); - return UnusedObjects; - }; - - if (const auto UnusedObjects = getUnusedObjects()) { - outs() << "BOLT-INFO: profile for " << UnusedObjects->size() - << " objects was ignored\n"; - if (opts::Verbosity >= 1) { - for (auto Name : *UnusedObjects) { - outs() << " " << Name << '\n'; - } - } - } - - if (ProfiledFunctions.size() > 10) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: top called functions are:\n"; - std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), - [](BinaryFunction *A, BinaryFunction *B) { - return B->getExecutionCount() < A->getExecutionCount(); - } - ); - auto SFI = ProfiledFunctions.begin(); - auto SFIend = ProfiledFunctions.end(); - for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { - outs() << " " << **SFI << " : " - << (*SFI)->getExecutionCount() << '\n'; - } - } } } @@ -3861,8 +3770,8 @@ void 
RewriteInstance::rewriteFile() { outs() << "BOLT: " << CountOverwrittenFunctions << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; - if (TotalScore != 0) { - double Coverage = OverwrittenScore / (double)TotalScore * 100.0; + if (BC->TotalScore != 0) { + double Coverage = OverwrittenScore / (double) BC->TotalScore * 100.0; outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) << "% of the execution count of simple functions of " "this binary.\n"; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 8bc3ad3da294..74c801a27d33 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -186,6 +186,8 @@ public: /// optimization. void disassembleFunctions(); + void postProcessFunctions(); + /// Run optimizations that operate at the binary, or post-linker, level. void runOptimizationPasses(); @@ -277,9 +279,6 @@ private: void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, bool EmitColdPart); - /// Perform a perf.data aggregation job instead of a binary rewriting one - void aggregateData(); - /// Detect addresses and offsets available in the binary for allocating /// new sections. void discoverStorage(); @@ -523,9 +522,6 @@ private: /// last emission, so that we may either decide to split or not optimize them. std::set LargeFunctions; - /// Total hotness score according to profiling data for this binary. - uint64_t TotalScore{0}; - /// Section header string table. StringTableBuilder SHStrTab; diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index 55aa6bb920ff..b283e371dfc6 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -178,9 +178,8 @@ int main(int argc, char **argv) { if (!opts::PerfData.empty()) { if (!opts::AggregateOnly) { errs() << ToolName - << ": reading perf data directly is unsupported, please use " - "-aggregate-only or perf2bolt\n"; - exit(1); + << ": WARNING: reading perf data directly is unsupported, please use " + "-aggregate-only or perf2bolt.\n!!! 
Proceed on your own risk. !!!\n"; } DA->start(opts::PerfData); } else if (!opts::InputDataFilename.empty()) {