//===--- IndirectCallPromotion.cpp - Indirect call promotion pass --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the indirect call promotion (ICP) pass, which
// converts indirect calls and jump table branches into guarded direct calls.
//
//===----------------------------------------------------------------------===//
#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/CommandLine.h"
#define DEBUG_TYPE "ICP"
#define DEBUG_VERBOSE(Level, X) if (opts::Verbosity >= (Level)) { X; }
using namespace llvm;
using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<IndirectCallPromotionType> IndirectCallPromotion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;
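// ICP activation follows the dual-threshold strategy used by LLVM's ICP:
// the frequency of a hot target is checked both against the total frequency
// of the original indirect call site ("total" threshold) and against the
// frequency remaining after excluding targets already converted to direct
// calls ("remaining" threshold). The remaining threshold is typically set
// higher than the total one. Separate pairs of knobs exist for jump tables
// and for indirect calls.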
static cl::opt<unsigned>
ICPJTRemainingPercentThreshold(
"icp-jt-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for jump tables"),
cl::init(30),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPJTTotalPercentThreshold(
"icp-jt-total-percent-threshold",
cl::desc("The percentage threshold against total count for the promotion for "
"jump tables"),
cl::init(5),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPCallsRemainingPercentThreshold(
"icp-calls-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for calls"),
cl::init(50),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPCallsTotalPercentThreshold(
"icp-calls-total-percent-threshold",
cl::desc("The percentage threshold against total count for the promotion for "
"calls"),
cl::init(30),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
IndirectCallPromotionMispredictThreshold(
"indirect-call-promotion-mispredict-threshold",
cl::desc("misprediction threshold for skipping ICP on an "
"indirect call"),
cl::init(0),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
IndirectCallPromotionUseMispredicts(
"indirect-call-promotion-use-mispredicts",
cl::desc("use misprediction frequency for determining whether or not ICP "
"should be applied at a callsite. The "
"-indirect-call-promotion-mispredict-threshold value will be used "
"by this heuristic"),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
IndirectCallPromotionTopN(
"indirect-call-promotion-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion. 0 = no limit"),
cl::init(3),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
IndirectCallPromotionCallsTopN(
"indirect-call-promotion-calls-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on calls. 0 = no limit"),
cl::init(0),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
IndirectCallPromotionJumpTablesTopN(
"indirect-call-promotion-jump-tables-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on jump tables. 0 = no limit"),
cl::init(0),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
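// When value/memory profiling data is available, ICP can eliminate the
// indirect load itself: virtual method calls are guarded by comparing the
// vtable address rather than the loaded method pointer (the method load
// moves to the cold path), and jump table branches are guarded by comparing
// the index register against hot indices rather than loading the entry.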
static cl::opt<bool>
EliminateLoads(
"icp-eliminate-loads",
cl::desc("enable load elimination using memory profiling data when "
"performing ICP"),
cl::init(true),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<unsigned>
ICPTopCallsites(
"icp-top-callsites",
cl::desc("optimize hottest calls until at least this percentage of all "
"indirect calls frequency is covered. 0 = all callsites"),
cl::init(99),
cl::Hidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::list<std::string>
ICPFuncsList("icp-funcs",
cl::CommaSeparated,
cl::desc("list of functions to enable ICP for"),
cl::value_desc("func1,func2,func3,..."),
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<bool>
ICPOldCodeSequence(
"icp-old-code-sequence",
cl::desc("use old code sequence for promoted calls"),
cl::init(false),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
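// Two strategies exist for promoting jump table branches. Optimizing by
// target loads the jump table entry and compares it against a known hot
// target, reducing branch mispredictions:
//
//   load %r10, JMPTABLE
//   cmp  %r10, HOTTARGET
//   je   HOTTARGET
//   ijmp [JMPTABLE + %index * scale]
//
// Optimizing by index (the default) compares the index register against
// known hot indices, avoiding the load from the jump table altogether:
//
//   cmp %index, HOTINDEX
//   je  CORRESPONDING_TARGET
//   ijmp [JMPTABLE + %index * scale]
//
// The downside of the index-based form is that several indices may map to
// the same hot target; since the profile records hot targets rather than
// hot indices, one comparison is emitted per candidate index.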
static cl::opt<bool> ICPJumpTablesByTarget(
"icp-jump-tables-targets",
cl::desc(
"for jump tables, optimize indirect jmp targets instead of indices"),
cl::init(false), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
namespace bolt {
namespace {
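// Check that profile data is still consistent after ICP: an edge with a
// positive branch count must not connect blocks whose execution counts are
// zero.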
bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
bool IsValid = true;
for (auto &BFI : BFs) {
BinaryFunction &BF = BFI.second;
if (!BF.isSimple()) continue;
for (BinaryBasicBlock *BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (BinaryBasicBlock *SuccBB : BB->successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
if (BB->getKnownExecutionCount() == 0 ||
SuccBB->getKnownExecutionCount() == 0) {
errs() << "BOLT-WARNING: profile verification failed after ICP for "
"function " << BF << '\n';
IsValid = false;
}
}
++BI;
}
}
}
return IsValid;
}
} // namespace
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
const IndirectCallProfile &ICP)
: From(BF.getSymbol()),
To(ICP.Offset),
Mispreds(ICP.Mispreds),
Branches(ICP.Count) {
if (ICP.Symbol) {
To.Sym = ICP.Symbol;
To.Addr = 0;
}
}
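// Print the promotion decision for a call site: aggregate and per-target
// counts and mispredictions, which of the top N targets will be optimized,
// and, for jump tables, the indices associated with each target.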
void IndirectCallPromotion::printDecision(
llvm::raw_ostream &OS,
std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
uint64_t TotalCount = 0;
uint64_t TotalMispreds = 0;
for (const Callsite &S : Targets) {
TotalCount += S.Branches;
TotalMispreds += S.Mispreds;
}
if (!TotalCount)
TotalCount = 1;
if (!TotalMispreds)
TotalMispreds = 1;
OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
<< " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
<< "\n";
size_t I = 0;
for (const Callsite &S : Targets) {
OS << "Count = " << S.Branches << ", "
<< format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
<< "Mispreds = " << S.Mispreds << ", "
<< format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
if (I < N)
OS << " * to be optimized *";
if (!S.JTIndices.empty()) {
OS << " Indices:";
for (const uint64_t Idx : S.JTIndices)
OS << " " << Idx;
}
OS << "\n";
I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
}
}
// Get the list of targets for a given call, sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
const MCInst &Inst) const {
BinaryFunction &BF = *BB.getFunction();
BinaryContext &BC = BF.getBinaryContext();
std::vector<Callsite> Targets;
if (const JumpTable *JT = BF.getJumpTable(Inst)) {
// Don't support PIC jump tables for now
if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
return Targets;
const Location From(BF.getSymbol());
const std::pair<size_t, size_t> Range =
JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
JumpTable::JumpInfo DefaultJI;
const JumpTable::JumpInfo *JI =
JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
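// When no profile counts are present, JI points at a zero-initialized
// JumpInfo and JIAdj is zero, so the loop below sees the same zeroed counts
// for every jump table entry.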
assert(JT->Type == JumpTable::JTT_PIC ||
JT->EntrySize == BC.AsmInfo->getCodePointerSize());
for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
MCSymbol *Entry = JT->Entries[I];
assert(BF.getBasicBlockForLabel(Entry) ||
Entry == BF.getFunctionEndLabel() ||
Entry == BF.getFunctionColdEndLabel());
if (Entry == BF.getFunctionEndLabel() ||
Entry == BF.getFunctionColdEndLabel())
continue;
const Location To(Entry);
const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Entry);
Targets.emplace_back(
From, To, BI.MispredictedCount, BI.Count, I - Range.first);
}
// Sort by symbol then addr.
std::sort(Targets.begin(), Targets.end(),
[](const Callsite &A, const Callsite &B) {
if (A.To.Sym && B.To.Sym)
return A.To.Sym < B.To.Sym;
else if (A.To.Sym && !B.To.Sym)
return true;
else if (!A.To.Sym && B.To.Sym)
return false;
else
return A.To.Addr < B.To.Addr;
});
  // Targets may contain multiple entries for the same target that differ
  // only in their jump table indices. The profile reports identical branch
  // counts for such entries because LBR records the branch target, not the
  // index value, so merge them here.
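  // A sketch with hypothetical entries: {Sym=A, Index=0}, {Sym=A, Index=8},
  // {Sym=B, Index=2} collapse below to {Sym=A, Indices={0, 8}} and
  // {Sym=B, Indices={2}}.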
auto First = Targets.begin();
auto Last = Targets.end();
auto Result = First;
while (++First != Last) {
Callsite &A = *Result;
const Callsite &B = *First;
if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) {
A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
B.JTIndices.end());
} else {
*(++Result) = *First;
}
}
++Result;
LLVM_DEBUG(if (Targets.end() - Result > 0) {
dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
<< " duplicate targets removed\n";
});
Targets.erase(Result, Targets.end());
} else {
// Don't try to optimize PC relative indirect calls.
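    // (On x86-64 this matches calls whose target is loaded through a
    // RIP-relative memory operand, e.g. "call *disp(%rip)" reading a GOT
    // slot, where operand 0 of the memory operand is the program counter.)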
if (Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter()) {
return Targets;
}
auto ICSP =
BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(Inst, "CallProfile");
if (ICSP) {
for (const IndirectCallProfile &CSP : ICSP.get()) {
Callsite Site(BF, CSP);
if (Site.isValid())
Targets.emplace_back(std::move(Site));
}
}
}
  // Sort by target count, then by the number of jump table indices, then by
  // mispredicts: prioritize targets with a high count, a small number of
  // indices, and many mispredicts.
std::stable_sort(Targets.begin(), Targets.end(),
[](const Callsite &A, const Callsite &B) {
if (A.Branches != B.Branches)
return A.Branches > B.Branches;
else if (A.JTIndices.size() != B.JTIndices.size())
return A.JTIndices.size() < B.JTIndices.size();
else
return A.Mispreds > B.Mispreds;
});
// Remove non-symbol targets
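  // (Erase-remove idiom: remove_if compacts the surviving entries to the
  // front and returns the new logical end; erase then drops the tail.)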
auto Last = std::remove_if(Targets.begin(),
Targets.end(),
[](const Callsite &CS) {
return !CS.To.Sym;
});
Targets.erase(Last, Targets.end());
LLVM_DEBUG(
if (BF.getJumpTable(Inst)) {
uint64_t TotalCount = 0;
uint64_t TotalMispreds = 0;
for (const Callsite &S : Targets) {
TotalCount += S.Branches;
TotalMispreds += S.Mispreds;
}
if (!TotalCount) TotalCount = 1;
if (!TotalMispreds) TotalMispreds = 1;
dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
<< ", Count = " << TotalCount
<< ", Mispreds = " << TotalMispreds << "\n";
size_t I = 0;
for (const Callsite &S : Targets) {
        dbgs() << "Count[" << I << "] = " << S.Branches << ", "
<< format("%.1f", (100.0*S.Branches)/TotalCount) << ", "
<< "Mispreds[" << I << "] = " << S.Mispreds << ", "
<< format("%.1f", (100.0*S.Mispreds)/TotalMispreds) << "\n";
++I;
}
});
return Targets;
}
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
MCInst &CallInst,
MCInst *&TargetFetchInst,
const JumpTable *JT) const {
assert(JT && "Can't get jump table addrs for non-jump tables.");
BinaryFunction &Function = *BB.getFunction();
BinaryContext &BC = Function.getBinaryContext();
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
return JumpTableInfoType();
JumpTableInfoType HotTargets;
MCInst *MemLocInstr;
MCInst *PCRelBaseOut;
unsigned BaseReg, IndexReg;
int64_t DispValue;
const MCExpr *DispExpr;
MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);
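  // analyzeIndirectBranch pattern-matches the instruction sequence feeding
  // CallInst and, on success, reports the pieces of the memory operand
  // (base/index registers, displacement) along with MemLocInstr, the
  // instruction that loads the branch target.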
assert(MemLocInstr && "There should always be a load for jump tables");
if (!MemLocInstr)
return JumpTableInfoType();
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
<< "jump table in " << Function << " at @ "
<< (&CallInst - &BB.front()) << "\n"
<< "BOLT-INFO: ICP target fetch instructions:\n";
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
if (MemLocInstr != &CallInst) {
BC.printInstruction(dbgs(), CallInst, 0, &Function);
}
});
DEBUG_VERBOSE(1, {
dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
<< "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
<< "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
<< "DispValue = " << Twine::utohexstr(DispValue) << ", "
<< "DispExpr = " << DispExpr << ", "
<< "MemLocInstr = ";
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
dbgs() << "\n";
});
++TotalIndexBasedCandidates;
  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n");
return JumpTableInfoType();
}
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
uint64_t ArrayStart;
if (DispExpr) {
ErrorOr<uint64_t> DispValueOrError =
BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
assert(DispValueOrError && "global symbol needs a value");
ArrayStart = *DispValueOrError;
} else {
ArrayStart = static_cast<uint64_t>(DispValue);
}
if (BaseReg == BC.MRI->getProgramCounter()) {
ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;
}
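  // Worked example with made-up numbers: for a RIP-relative table load,
  // DispValue is relative to the end of MemLocInstr, so with
  // Function.getAddress() = 0x400000, NextInstrOffset = 0x120 and
  // DispValue = 0x2000, the table slice starts at 0x402120.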
  // A map of [symbol] -> [count, index] used to merge jump table indices
  // that resolve to the same target symbol, since multiple table entries
  // (and thus accessed addresses) may point at the same target.
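  // E.g. (hypothetical) if two accessed entries both hold symbol A, their
  // counts accumulate in HotTargetMap[A].first while .second keeps the last
  // index seen for A.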
std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);
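  // Range is a half-open [first, second) slice of JT->Entries that covers
  // the part of the (possibly shared) jump table starting at ArrayStart.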
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
size_t Index;
    // Memory profiling data occasionally includes null pointers; ignore them.
if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
continue;
if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
return JumpTableInfoType();
if (AccessInfo.MemoryObject) {
// Deal with bad/stale data
if (!AccessInfo.MemoryObject->getName().
startswith("JUMP_TABLE/" + Function.getOneName().str()))
return JumpTableInfoType();
Index =
(AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
} else {
Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
}
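    // Index arithmetic, with hypothetical numbers: JT->getAddress() = 0x1000,
    // ArrayStart = 0x1010, EntrySize = 8. A MemoryObject access at offset
    // 0x20 into the table gives Index = (0x20 - 0x10) / 8 = 2; an absolute
    // access at 0x1020 gives the same via (0x1020 - 0x1010) / 8.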
    // If Index is out of range, the memory profiling data is probably wrong
    // for this instruction; bail out.
if (Index >= Range.second) {
LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
<< ", " << Range.second << "\n");
return JumpTableInfoType();
}
// Make sure the hot index points at a legal label corresponding to a BB,
// e.g. not the end of function (unreachable) label.
if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
LLVM_DEBUG({
dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
<< "label " << JT->Entries[Index + Range.first]->getName()
<< " in jump table:\n";
JT->print(dbgs());
dbgs() << "HotTargetMap:\n";
for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
HotTargetMap) {
dbgs() << "BOLT-INFO: " << HT.first->getName()
<< " = (count=" << HT.first << ", index=" << HT.second
<< ")\n";
}
});
return JumpTableInfoType();
}
std::pair<uint64_t, uint64_t> &HotTarget =
HotTargetMap[JT->Entries[Index + Range.first]];
HotTarget.first += AccessInfo.Count;
HotTarget.second = Index;
}
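// Flatten the map into (count, index) pairs for sorting.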
std::transform(
HotTargetMap.begin(),
HotTargetMap.end(),
std::back_inserter(HotTargets),
[](const std::pair<MCSymbol *, std::pair<uint64_t, uint64_t>> &A) {
return A.second;
});
// Sort with highest counts first.
std::sort(HotTargets.rbegin(), HotTargets.rend());
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
for (const std::pair<uint64_t, uint64_t> &Target : HotTargets) {
dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
<< "Count = " << Target.first << "\n";
}
});
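// Remember which register holds the jump table index so the promoted code
// can compare it against hot index values and skip the jump table load.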
BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;
TargetFetchInst = MemLocInstr;
return HotTargets;
}
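// Collect the symbolic targets of the indirect branch in CallInst. For jump
// tables, hot entries identified via the memory profile are split out into
// individual (symbol, index) targets; for plain indirect calls the top N
// target symbols are returned as-is.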
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
size_t &N, BinaryBasicBlock &BB,
MCInst &CallInst,
MCInst *&TargetFetchInst) const {
const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
SymTargetsType SymTargets;
if (JT) {
JumpTableInfoType HotTargets =
maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);
if (!HotTargets.empty()) {
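// Map a jump table index back to the entry in Targets that covers it.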
auto findTargetsIndex = [&](uint64_t JTIndex) {
for (size_t I = 0; I < Targets.size(); ++I) {
std::vector<uint64_t> &JTIs = Targets[I].JTIndices;
if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
return I;
}
LLVM_DEBUG(
dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
<< " table entry in " << *BB.getFunction() << "\n");
llvm_unreachable("Hot indices must be referred to by at least one "
"callsite");
};
if (opts::Verbosity >= 1) {
for (size_t I = 0; I < HotTargets.size(); ++I) {
outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
<< HotTargets[I].first << ", " << HotTargets[I].second << ")\n";
}
}
// Recompute the hottest targets, this time discriminating which index is
// hot. NOTE: this is a tradeoff. On one hand, we get index information.
// On the other hand, info coming from the memory profile is much less
// accurate than LBRs, so we may end up working with coarser profile
// granularity in exchange for information about indices.
std::vector<Callsite> NewTargets;
std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
uint64_t TotalMemAccesses = 0;
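// Count how many hot indices resolve to each target symbol so that branch
// and mispredict counts can be split evenly among them below.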
for (size_t I = 0; I < HotTargets.size(); ++I) {
const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
++IndicesPerTarget[Targets[TargetIndex].To.Sym];
TotalMemAccesses += HotTargets[I].first;
}
uint64_t RemainingMemAccesses = TotalMemAccesses;
const size_t TopN = opts::IndirectCallPromotionJumpTablesTopN != 0
? opts::IndirectCallPromotionJumpTablesTopN
: opts::IndirectCallPromotionTopN;
size_t I = 0;
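// Dual-threshold policy: promote a hot index only while its access count is
// a large enough share of both the total and the still-unpromoted accesses,
// and only up to the TopN budget.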
for (; I < HotTargets.size(); ++I) {
const uint64_t MemAccesses = HotTargets[I].first;
if (100 * MemAccesses <
TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
break;
if (100 * MemAccesses <
RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
break;
if (TopN && I >= TopN)
break;
RemainingMemAccesses -= MemAccesses;
const uint64_t JTIndex = HotTargets[I].second;
Callsite &Target = Targets[findTargetsIndex(JTIndex)];
NewTargets.push_back(Target);
NewTargets.back().JTIndices = {JTIndex};
Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
Target.JTIndices.end(), JTIndex),
Target.JTIndices.end());
// Keep fixCFG counts sane if more indices use this same target later
assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
NewTargets.back().Branches =
Target.Branches / IndicesPerTarget[Target.To.Sym];
NewTargets.back().Mispreds =
Target.Mispreds / IndicesPerTarget[Target.To.Sym];
assert(Target.Branches >= NewTargets.back().Branches);
assert(Target.Mispreds >= NewTargets.back().Mispreds);
Target.Branches -= NewTargets.back().Branches;
Target.Mispreds -= NewTargets.back().Mispreds;
}
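// Append the remaining (cold) targets after the promoted hot ones and adopt
// the combined list as the new target order.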
std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
std::swap(NewTargets, Targets);
N = I;
if (N == 0 && opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
<< BB.getName()
<< ": failed to meet thresholds after memory profile data was "
"loaded.\n";
return SymTargets;
}
}
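// Emit one (symbol, index) pair per jump table index; a target reached via
// several indices contributes several entries.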
for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
Callsite &Target = Targets[TgtIdx];
assert(Target.To.Sym && "All ICP targets must be to known symbols");
assert(!Target.JTIndices.empty() && "Jump tables must have indices");
for (uint64_t Idx : Target.JTIndices) {
SymTargets.emplace_back(Target.To.Sym, Idx);
++I;
}
}
} else {
for (size_t I = 0; I < N; ++I) {
assert(Targets[I].To.Sym &&
"All ICP targets must be to known symbols");
assert(Targets[I].JTIndices.empty() &&
"Can't have jump table indices for non-jump tables");
SymTargets.emplace_back(Targets[I].To.Sym, 0);
}
}
return SymTargets;
}
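
// Use the value (memory) profiling data attached to the vtable load to try
// to associate a vtable address with each symbolic target of an indirect
// call. If every target can be matched to a vtable, the call site is almost
// certainly a virtual method call, and the promoted code can compare the
// vtable address instead of the method pointer, sinking the method load
// into the cold path. Illustrative sketch only (registers, offsets, and
// symbol names depend on the call site):
//
//   mov   (%rdi), %rax        # load vtable pointer
//   call  *0x10(%rax)         # indirect call through method pointer
//
// becomes:
//
//   mov   (%rdi), %rax
//   cmp   $HotVtable, %rax    # compare against the vtable address
//   jne   cold_call           # cold path still loads the method pointer
//   call  HotClass::method    # promoted direct call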
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
BinaryBasicBlock &BB, MCInst &Inst,
const SymTargetsType &SymTargets) const {
BinaryFunction &Function = *BB.getFunction();
BinaryContext &BC = Function.getBinaryContext();
std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
std::vector<MCInst *> MethodFetchInsns;
unsigned VtableReg, MethodReg;
uint64_t MethodOffset;
assert(!Function.getJumpTable(Inst) &&
"Can't get vtable addrs for jump tables.");
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
return MethodInfoType();
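
  // Walk the instructions of this block backwards from the call site to
  // locate the vtable load, the method-pointer fetch sequence, and the
  // method's byte offset within the vtable.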
MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(),
Insts.end(),
MethodFetchInsns,
VtableReg,
MethodReg,
MethodOffset)) {
DEBUG_VERBOSE(
1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
<< Function << " at @ " << (&Inst - &BB.front()) << "\n");
return MethodInfoType();
}
++TotalMethodLoadEliminationCandidates;
DEBUG_VERBOSE(1, {
dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
<< " at @ " << (&Inst - &BB.front()) << "\n";
dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *FetchInst : MethodFetchInsns) {
      BC.printInstruction(dbgs(), *FetchInst, 0, &Function);
    }
if (MethodFetchInsns.back() != &Inst) {
BC.printInstruction(dbgs(), Inst, 0, &Function);
}
});
// Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
    DEBUG_VERBOSE(1,
                  dbgs() << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
// Find the vtable that each method belongs to.
std::map<const MCSymbol *, uint64_t> MethodToVtable;
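
  // A profiled access may be recorded relative to a known memory object;
  // reconstruct the absolute address before using it.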
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
uint64_t Address = AccessInfo.Offset;
if (AccessInfo.MemoryObject) {
Address += AccessInfo.MemoryObject->getAddress();
}
// Ignore bogus data.
if (!Address)
continue;
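
    // The profiled address points at vtable base + method offset, so
    // subtracting the offset recovers the candidate vtable base.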
const uint64_t VtableBase = Address - MethodOffset;
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
<< Twine::utohexstr(VtableBase)
<< "+" << MethodOffset << "/" << AccessInfo.Count
<< "\n");
if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
if (!MethodBD) // skip unknown methods
continue;
MCSymbol *MethodSym = MethodBD->getSymbol();
MethodToVtable[MethodSym] = VtableBase;
DEBUG_VERBOSE(1, {
const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
dbgs() << "BOLT-INFO: ICP found method = "
<< Twine::utohexstr(MethodAddr.get()) << "/"
<< (Method ? Method->getPrintName() : "") << "\n";
});
}
}
// Find the vtable for each target symbol.
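  // Only if every target resolves to a vtable do we treat this call site
  // as a virtual method call; otherwise the information is discarded.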
for (size_t I = 0; I < SymTargets.size(); ++I) {
auto Itr = MethodToVtable.find(SymTargets[I].first);
if (Itr != MethodToVtable.end()) {
if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
const uint64_t Addend = Itr->second - BD->getAddress();
VtableSyms.emplace_back(BD->getSymbol(), Addend);
continue;
}
}
// Give up if we can't find the vtable for a method.
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
<< SymTargets[I].first->getName() << "\n");
return MethodInfoType();
}
// Make sure the vtable reg is not clobbered by the argument passing code.
if (VtableReg != MethodReg) {
for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
++CurInst) {
const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI)) {
return MethodInfoType();
}
}
}
return MethodInfoType(VtableSyms, MethodFetchInsns);
}
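// A sketch of the rewritten code for a callsite with N promoted targets
// (the common case; the exact sequence comes from the MIB):
//   cmp <key>, <hot target 0>; je <call block 0>
//   cmp <key>, <hot target 1>; je <call block 1>
//   ...
//   <original indirect call/jump>            <- cold path
// where <key> is the method pointer, vtable address, or jump-table index,
// each call block invokes its target directly, and, for non-tail calls,
// all paths rejoin in a common merge block.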
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
MCPlusBuilder::BlocksVectorTy &&ICPcode,
const std::vector<MCInst *> &MethodFetchInsns) const {
BinaryFunction &Function = *IndCallBlock.getFunction();
MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
// Create new basic blocks with correct code in each one first.
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
const bool IsTailCallOrJT =
(MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));
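// Tail calls and jump tables do not return control to this block, so the
// rewritten code needs no merge block for them (see fixCFG below).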
// Move instructions from the tail of the original call block
// to the merge block.
// Remember any pseudo instructions following a tail call. These
// must be preserved and moved to the original block.
InstructionListType TailInsts;
const MCInst *TailInst = &CallInst;
if (IsTailCallOrJT) {
while (TailInst + 1 < &(*IndCallBlock.end()) &&
MIB->isPseudo(*(TailInst + 1))) {
TailInsts.push_back(*++TailInst);
}
}
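// splitInstructions moves everything after the call into MovedInst; for
// non-tail calls these instructions are re-attached to the merge block
// below, while for tail calls only the preserved pseudos are re-added.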
InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
// Link new BBs to the original input offset of the BB where the indirect
// call site is, so we can map samples recorded in new BBs back to the
// original BB seen in the input binary (if using BAT).
const uint32_t OrigOffset = IndCallBlock.getInputOffset();
IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
MethodFetchInsns.end());
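// If the call block is now empty, or the call itself was the last
// method-fetch instruction (and was just erased above), append the ICP
// code; otherwise the call is still the last instruction, so replace it.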
if (IndCallBlock.empty() ||
(!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst)) {
IndCallBlock.addInstructions(ICPcode.front().second.begin(),
ICPcode.front().second.end());
} else {
IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
ICPcode.front().second);
}
IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());
for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
MCSymbol *&Sym = Itr->first;
InstructionListType &Insts = Itr->second;
assert(Sym);
std::unique_ptr<BinaryBasicBlock> TBB =
Function.createBasicBlock(OrigOffset, Sym);
for (MCInst &Inst : Insts) { // Sanitize new instructions.
if (MIB->isCall(Inst))
MIB->removeAnnotation(Inst, "CallProfile");
}
TBB->addInstructions(Insts.begin(), Insts.end());
NewBBs.emplace_back(std::move(TBB));
}
// Move tail of instructions from after the original call to
// the merge block.
if (!IsTailCallOrJT) {
NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());
}
return NewBBs;
}
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
const bool IsTailCall, const bool IsJumpTable,
IndirectCallPromotion::BasicBlocksVector &&NewBBs,
const std::vector<Callsite> &Targets) const {
BinaryFunction &Function = *IndCallBlock.getFunction();
using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
BinaryBasicBlock *MergeBlock = nullptr;
// Scale indirect call counts to the execution count of the original
// basic block containing the indirect call.
uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
uint64_t TotalIndirectBranches = 0;
for (const Callsite &Target : Targets) {
TotalIndirectBranches += Target.Branches;
}
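// Guard against division by zero when scaling the counts below.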
if (TotalIndirectBranches == 0)
TotalIndirectBranches = 1;
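// BBI keeps the raw per-target counts, split evenly across a target's
// jump-table indices; ScaledBBI rescales them so that they sum to the
// execution count of the original block.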
BinaryBasicBlock::BranchInfoType BBI;
BinaryBasicBlock::BranchInfoType ScaledBBI;
for (const Callsite &Target : Targets) {
const size_t NumEntries =
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
for (size_t I = 0; I < NumEntries; ++I) {
BBI.push_back(
BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
(Target.Mispreds + NumEntries - 1) / NumEntries});
ScaledBBI.push_back(BinaryBranchInfo{
uint64_t(TotalCount * Target.Branches /
(NumEntries * TotalIndirectBranches)),
uint64_t(TotalCount * Target.Mispreds /
(NumEntries * TotalIndirectBranches))});
}
}
if (IsJumpTable) {
BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);
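// Flatten the targets into one symbol per promoted jump-table entry so
// that SymTargets lines up index-for-index with BBI and ScaledBBI above.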
std::vector<MCSymbol *> SymTargets;
for (const Callsite &Target : Targets) {
const size_t NumEntries =
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
for (size_t I = 0; I < NumEntries; ++I) {
SymTargets.push_back(Target.To.Sym);
}
}
assert(SymTargets.size() > NewBBs.size() - 1 &&
"There must be a target symbol associated with each new BB.");
for (uint64_t I = 0; I < NewBBs.size(); ++I) {
BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
SourceBB->setExecutionCount(TotalCount);
BinaryBasicBlock *TargetBB =
Function.getBasicBlockForLabel(SymTargets[I]);
SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken
TotalCount -= ScaledBBI[I].Count;
SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through
// Update branch info for the indirect jump.
BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
NewIndCallBlock->getBranchInfo(*TargetBB);
if (BranchInfo.Count > BBI[I].Count)
BranchInfo.Count -= BBI[I].Count;
else
BranchInfo.Count = 0;
if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
else
BranchInfo.MispredictedCount = 0;
}
} else {
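// For non-tail calls NewBBs holds an odd number of blocks: N call blocks,
// N - 1 compare blocks, the cold indirect-call block, and the merge block.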
assert(NewBBs.size() >= 2);
assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
assert(NewBBs.size() % 2 == 1 || IsTailCall);
auto ScaledBI = ScaledBBI.begin();
auto updateCurrentBranchInfo = [&]{
assert(ScaledBI != ScaledBBI.end());
TotalCount -= ScaledBI->Count;
++ScaledBI;
};
if (!IsTailCall) {
MergeBlock = NewBBs.back().get();
IndCallBlock.moveAllSuccessorsTo(MergeBlock);
}
// Fix up successors and execution counts.
updateCurrentBranchInfo();
IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);
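// From here on the layout alternates: even-indexed NewBBs hold the
// promoted direct calls (each flowing into the merge block when there is
// one), odd-indexed ones hold the next compare, whose hot edge leads to
// the following call block. The last one (tail call) or two (cold block
// plus merge block) entries are handled separately below.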
const size_t Adj = IsTailCall ? 1 : 2;
for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
assert(TotalCount <= IndCallBlock.getExecutionCount() ||
TotalCount <= uint64_t(TotalIndirectBranches));
uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
if (I % 2 == 0) {
if (MergeBlock) {
NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
}
} else {
assert(I + 2 < NewBBs.size());
updateCurrentBranchInfo();
NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
ExecCount += TotalCount;
}
NewBBs[I]->setExecutionCount(ExecCount);
}
if (MergeBlock) {
// Arrange for the MergeBlock to be the fallthrough for the first
// promoted call block.
std::unique_ptr<BinaryBasicBlock> MBPtr;
std::swap(MBPtr, NewBBs.back());
NewBBs.pop_back();
NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
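// The merge block now sits at index 1, directly after the first new
// block, so it becomes that block's layout fallthrough.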
// TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
}
}
// Update the execution count.
NewBBs.back()->setExecutionCount(TotalCount);
// Update BB and BB layout.
Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
assert(Function.validateCFG());
return MergeBlock;
}
size_t IndirectCallPromotion::canPromoteCallsite(
const BinaryBasicBlock &BB, const MCInst &Inst,
const std::vector<Callsite> &Targets, uint64_t NumCalls) {
if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
return 0;
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
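// Accumulate per-target stats for the first N targets; these running
// totals feed the pass-wide summary of frequent calls/jumps reported
// after all functions have been processed.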
auto computeStats = [&](size_t N) {
for (size_t I = 0; I < N; ++I) {
if (!IsJumpTable)
TotalNumFrequentCalls += Targets[I].Branches;
else
TotalNumFrequentJmps += Targets[I].Branches;
}
};
// If we have no targets (or no calls), skip this callsite.
if (Targets.empty() || !NumCalls) {
if (opts::Verbosity >= 1) {
const auto InstIdx = &Inst - &(*BB.begin());
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
<< InstIdx << " in " << BB.getName() << ", calls = " << NumCalls
<< ", targets empty or NumCalls == 0.\n";
}
return 0;
}
size_t TopN = opts::IndirectCallPromotionTopN;
if (IsJumpTable) {
if (opts::IndirectCallPromotionJumpTablesTopN != 0)
TopN = opts::IndirectCallPromotionJumpTablesTopN;
} else if (opts::IndirectCallPromotionCallsTopN != 0) {
TopN = opts::IndirectCallPromotionCallsTopN;
}
const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();
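// E.g. (hypothetical values): with -indirect-call-promotion-topn=3 and
// five profiled targets, TrialN = 3; with TopN == 0, every profiled
// target is considered and TrialN = Targets.size().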
if (opts::ICPTopCallsites > 0) {
BinaryContext &BC = BB.getFunction()->getBinaryContext();
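// Only callsites that the top-callsites pre-scan in runOnFunctions()
// marked with the "DoICP" annotation are considered for promotion.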
if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
return 0;
}
// Pick the top N targets.
uint64_t TotalCallsTopN = 0;
uint64_t TotalMispredictsTopN = 0;
size_t N = 0;
if (opts::IndirectCallPromotionUseMispredicts &&
(!IsJumpTable || opts::ICPJumpTablesByTarget)) {
// Count total number of mispredictions for (at most) the top N targets.
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
// is exceeded by fewer targets.
double Threshold = double(opts::IndirectCallPromotionMispredictThreshold);
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
TotalMispredictsTopN += Targets[I].Mispreds;
}
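// E.g. (hypothetical counts): NumCalls = 1000 and a 30% threshold; two
// targets with 250 and 80 mispredictions exhaust the budget
// (30 - 25.0 - 8.0 < 0), so the loop stops with N = 2.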
computeStats(N);
// Compute the misprediction frequency of the top N call targets. If this
// frequency is greater than the threshold, we should try ICP on this
// callsite.
const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
if (TopNFrequency == 0 ||
TopNFrequency < opts::IndirectCallPromotionMispredictThreshold) {
if (opts::Verbosity >= 1) {
const auto InstIdx = &Inst - &(*BB.begin());
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
<< InstIdx << " in " << BB.getName() << ", calls = " << NumCalls
<< ", top N mis. frequency " << format("%.1f", TopNFrequency)
<< "% < " << opts::IndirectCallPromotionMispredictThreshold
<< "%\n";
}
return 0;
}
} else {
size_t MaxTargets = 0;
// Count total number of calls for (at most) the top N targets.
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
// is exceeded by fewer targets.
const unsigned TotalThreshold = IsJumpTable
? opts::ICPJTTotalPercentThreshold
: opts::ICPCallsTotalPercentThreshold;
const unsigned RemainingThreshold =
IsJumpTable ? opts::ICPJTRemainingPercentThreshold
: opts::ICPCallsRemainingPercentThreshold;
uint64_t NumRemainingCalls = NumCalls;
for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
break;
if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
break;
if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
TrialN)
break;
TotalCallsTopN += Targets[I].Branches;
TotalMispredictsTopN += Targets[I].Mispreds;
NumRemainingCalls -= Targets[I].Branches;
N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
}
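// Worked example (hypothetical profile): NumCalls = 1000, targets with
// 600/300/100 branches, TotalThreshold = 30, RemainingThreshold = 50.
//   Target 0: 60000 >= 30000 and 60000 >= 50000 -> taken; 400 remain.
//   Target 1: 30000 >= 30000 and 30000 >= 20000 -> taken; 100 remain.
//   Target 2: 10000 <  30000 -> stop. MaxTargets = N = 2.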
computeStats(MaxTargets);
// Don't check misprediction frequency for jump tables -- we don't really
// care as long as we are saving loads from the jump table.
if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
// Compute the misprediction frequency of the top N call targets. If
// this frequency is less than the threshold, we should skip ICP at
// this callsite.
const double TopNMispredictFrequency =
(100.0 * TotalMispredictsTopN) / NumCalls;
if (TopNMispredictFrequency <
opts::IndirectCallPromotionMispredictThreshold) {
if (opts::Verbosity >= 1) {
const auto InstIdx = &Inst - &(*BB.begin());
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
<< InstIdx << " in " << BB.getName()
<< ", calls = " << NumCalls << ", top N mispredict frequency "
<< format("%.1f", TopNMispredictFrequency) << "% < "
<< opts::IndirectCallPromotionMispredictThreshold << "%\n";
}
return 0;
}
}
}
// Filter functions that can have ICP applied (for debugging): when a
// list is supplied, only callsites in the named functions are promoted.
if (!opts::ICPFuncsList.empty()) {
for (std::string &Name : opts::ICPFuncsList) {
if (BB.getFunction()->hasName(Name))
return N;
}
return 0;
}
return N;
}
void IndirectCallPromotion::printCallsiteInfo(
const BinaryBasicBlock &BB, const MCInst &Inst,
const std::vector<Callsite> &Targets, const size_t N,
uint64_t NumCalls) const {
BinaryContext &BC = BB.getFunction()->getBinaryContext();
const bool IsTailCall = BC.MIB->isTailCall(Inst);
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
const auto InstIdx = &Inst - &(*BB.begin());
outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
<< " @ " << InstIdx << " in " << BB.getName()
<< " -> calls = " << NumCalls
<< (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
<< "\n";
for (size_t I = 0; I < N; ++I) {
const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
outs() << "BOLT-INFO: ";
if (Targets[I].To.Sym)
outs() << Targets[I].To.Sym->getName();
else
outs() << Targets[I].To.Addr;
outs() << ", calls = " << Targets[I].Branches
<< ", mispreds = " << Targets[I].Mispreds
<< ", taken freq = " << format("%.1f", Frequency) << "%"
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
bool First = true;
for (uint64_t JTIndex : Targets[I].JTIndices) {
outs() << (First ? ", indices = " : ", ") << JTIndex;
First = false;
}
outs() << "\n";
}
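// Sample output (hypothetical symbols and counts):
//   BOLT-INFO: ICP candidate branch info: _Z3foov @ 5 in .LBB01 -> calls = 1000 (jump table)
//   BOLT-INFO: _Z3barv, calls = 600, mispreds = 120, taken freq = 60.0%, mis. freq = 12.0%, indices = 0, 8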
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP original call instruction:";
BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
});
}
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (opts::IndirectCallPromotion == ICP_NONE)
return;
auto &BFs = BC.getBinaryFunctions();
const bool OptimizeCalls =
(opts::IndirectCallPromotion == ICP_CALLS ||
opts::IndirectCallPromotion == ICP_ALL);
const bool OptimizeJumpTables =
(opts::IndirectCallPromotion == ICP_JUMP_TABLES ||
opts::IndirectCallPromotion == ICP_ALL);
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
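// Jump-table promotion is the only mode that needs the call graph: it
// feeds a conservative register analysis, presumably used to prove that a
// scratch register is free when rewriting a jump-table site.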
if (OptimizeJumpTables) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
}
// If icp-top-callsites is enabled, compute the total number of indirect
// calls and then optimize the hottest callsites that contribute to that
// total.
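// The option value is read as a percentage (an assumption based on its
// use below): the hottest callsites whose counts add up to that share of
// all profiled indirect calls are annotated with "DoICP".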
SetVector<BinaryFunction *> Functions;
if (opts::ICPTopCallsites == 0) {
for (auto &KV : BFs) {
Functions.insert(&KV.second);
}
} else {
using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
std::vector<IndirectCallsite> IndirectCalls;
size_t TotalIndirectCalls = 0;
// Find all the indirect callsites.
for (auto &BFIt : BFs) {
BinaryFunction &Function = BFIt.second;
if (!Function.isSimple() || Function.isIgnored() ||
!Function.hasProfile())
continue;
const bool HasLayout = !Function.layout_empty();
for (BinaryBasicBlock &BB : Function) {
if (HasLayout && Function.isSplit() && BB.isCold())
continue;
for (MCInst &Inst : BB) {
const bool IsJumpTable = Function.getJumpTable(Inst);
const bool HasIndirectCallProfile =
BC.MIB->hasAnnotation(Inst, "CallProfile");
const bool IsDirectCall = (BC.MIB->isCall(Inst) &&
BC.MIB->getTargetSymbol(Inst, 0));
if (!IsDirectCall &&
((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
(IsJumpTable && OptimizeJumpTables))) {
uint64_t NumCalls = 0;
for (const Callsite &BInfo : getCallTargets(BB, Inst)) {
NumCalls += BInfo.Branches;
}
IndirectCalls.push_back(
std::make_tuple(NumCalls, &Inst, &Function));
TotalIndirectCalls += NumCalls;
}
}
}
}
// Sort callsites by execution count.
std::sort(IndirectCalls.rbegin(), IndirectCalls.rend());
    // Find the callsites that contribute the top opts::ICPTopCallsites
    // percent of all indirect calls.
const float TopPerc = opts::ICPTopCallsites / 100.0f;
int64_t MaxCalls = TotalIndirectCalls * TopPerc;
uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
size_t Num = 0;
for (const IndirectCallsite &IC : IndirectCalls) {
const uint64_t CurFreq = std::get<0>(IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency as the last one, to avoid non-deterministic behavior
      // (e.g. turning ICP on/off depending on the order of functions).
if (MaxCalls <= 0 && CurFreq != LastFreq)
break;
MaxCalls -= CurFreq;
LastFreq = CurFreq;
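      // Mark this callsite for promotion and remember its containing
      // function so that only the selected functions are processed below.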
BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
Functions.insert(std::get<2>(IC));
++Num;
}
outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
<< ", " << Num << " callsites cover " << opts::ICPTopCallsites
<< "% of all indirect calls\n";
}
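  // Now visit each candidate function and promote its profitable callsites.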
for (BinaryFunction *FuncPtr : Functions) {
BinaryFunction &Function = *FuncPtr;
if (!Function.isSimple() || Function.isIgnored() || !Function.hasProfile())
continue;
const bool HasLayout = !Function.layout_empty();
// Total number of indirect calls issued from the current Function.
// (a fraction of TotalIndirectCalls)
uint64_t FuncTotalIndirectCalls = 0;
uint64_t FuncTotalIndirectJmps = 0;
std::vector<BinaryBasicBlock *> BBs;
for (BinaryBasicBlock &BB : Function) {
// Skip indirect calls in cold blocks.
if (!HasLayout || !Function.isSplit() || !BB.isCold()) {
BBs.push_back(&BB);
}
}
if (BBs.empty())
continue;
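    // Dataflow info (liveness, based on the register analysis RA) is used
    // below to check whether the FLAGS register may be clobbered at a
    // candidate jump site.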
DataflowInfoManager Info(Function, RA.get(), nullptr);
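    // Blocks are processed as a worklist: rewriting a callsite splits its
    // block, and the split-off tail (the merge block) is pushed back onto
    // BBs so any remaining indirect calls in it are also visited.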
while (!BBs.empty()) {
BinaryBasicBlock *BB = BBs.back();
BBs.pop_back();
for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
MCInst &Inst = BB->getInstructionAtIndex(Idx);
const auto InstIdx = &Inst - &(*BB->begin());
const bool IsTailCall = BC.MIB->isTailCall(Inst);
const bool HasIndirectCallProfile =
BC.MIB->hasAnnotation(Inst, "CallProfile");
const bool IsJumpTable = Function.getJumpTable(Inst);
if (BC.MIB->isCall(Inst)) {
TotalCalls += BB->getKnownExecutionCount();
}
if (IsJumpTable && !OptimizeJumpTables)
continue;
if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
continue;
// Ignore direct calls.
if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
continue;
assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst))
&& "expected a call or an indirect jump instruction");
if (IsJumpTable)
++TotalJumpTableCallsites;
else
++TotalIndirectCallsites;
std::vector<Callsite> Targets = getCallTargets(*BB, Inst);
// Compute the total number of calls from this particular callsite.
uint64_t NumCalls = 0;
for (const Callsite &BInfo : Targets) {
NumCalls += BInfo.Branches;
}
if (!IsJumpTable)
FuncTotalIndirectCalls += NumCalls;
else
FuncTotalIndirectJmps += NumCalls;
        // If the FLAGS register is live after this jump site, do not try to
        // promote it, because the promoted code would clobber FLAGS.
if (IsJumpTable) {
ErrorOr<const BitVector &> State =
Info.getLivenessAnalysis().getStateBefore(Inst);
          if (!State || (*State)[BC.MIB->getFlagsReg()]) {
if (opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< (State ? ", cannot clobber flags reg.\n"
: ", no liveness data available.\n");
}
continue;
}
}
// Should this callsite be optimized? Return the number of targets
// to use when promoting this call. A value of zero means to skip
// this callsite.
size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);
        // If this is a jump table that failed to meet our initial threshold,
        // still proceed to findCallTargetSymbols -- it may reevaluate N if a
        // memory profile is present.
if (!N && !IsJumpTable)
continue;
if (opts::Verbosity >= 1) {
printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);
}
// Find MCSymbols or absolute addresses for each call target.
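        // For jump tables, this may also identify the instruction that
        // fetches the target (TargetFetchInst), which enables index-based
        // promotion that avoids the jump table load entirely.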
MCInst *TargetFetchInst = nullptr;
const SymTargetsType SymTargets =
findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);
        // findCallTargetSymbols may have changed N if a memory profile is
        // available for jump tables.
if (!N)
continue;
LLVM_DEBUG(printDecision(dbgs(), Targets, N));
        // If we failed to resolve a symbol for any of the requested targets,
        // punt on this callsite.
// TODO: can this ever happen?
if (SymTargets.size() < N) {
const size_t LastTarget = SymTargets.size();
if (opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< ", ICP failed to find target symbol for "
<< Targets[LastTarget].To.Sym->getName() << "\n";
}
continue;
}
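        // For indirect calls, try to recover the vtable symbols so that the
        // promoted compare tests the vtable pointer instead of the loaded
        // method pointer; for jump tables, record the target-fetch
        // instruction so the load can be moved out of the hot path.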
MethodInfoType MethodInfo;
if (!IsJumpTable) {
MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
LLVM_DEBUG(dbgs()
<< "BOLT-INFO: ICP "
<< (!MethodInfo.first.empty() ? "found" : "did not find")
<< " vtables for all methods.\n");
} else if (TargetFetchInst) {
++TotalIndexBasedJumps;
MethodInfo.second.push_back(TargetFetchInst);
}
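        // Conceptually, the promoted sequence for one hot jump-table entry
        // resembles:
        //   cmp  %index, HOT_INDEX
        //   je   CORRESPONDING_TARGET
        //   jmp  *JMPTABLE(,%index,scale)  ; cold path keeps the original jump
        // Indirect calls compare against the target (or vtable) address
        // instead of the index.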
// Generate new promoted call code for this callsite.
MCPlusBuilder::BlocksVectorTy ICPcode =
(IsJumpTable && !opts::ICPJumpTablesByTarget)
? BC.MIB->jumpTablePromotion(Inst, SymTargets,
MethodInfo.second, BC.Ctx.get())
: BC.MIB->indirectCallPromotion(
Inst, SymTargets, MethodInfo.first, MethodInfo.second,
opts::ICPOldCodeSequence, BC.Ctx.get());
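// An empty code sequence means the target could not produce promoted
// code for this callsite.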
if (ICPcode.empty()) {
if (opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< ", unable to generate promoted call code.\n";
}
continue;
}
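// Dump the generated code sequence in debug builds.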
LLVM_DEBUG({
uint64_t Offset = Targets[0].From.Addr;
dbgs() << "BOLT-INFO: ICP indirect call code:\n";
for (const auto &Entry : ICPcode) {
const MCSymbol *Sym = Entry.first;
const InstructionListType &Insts = Entry.second;
if (Sym)
dbgs() << Sym->getName() << ":\n";
Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(), Offset);
}
dbgs() << "---------------------------------------------------\n";
});
// Rewrite the CFG with the newly generated ICP code.
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);
// Fix the CFG after inserting the new basic blocks.
BinaryBasicBlock *MergeBlock =
fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);
// Since the tail of the original block was split off and it may contain
// additional indirect calls, we must add the merge block to the set of
// blocks to process.
if (MergeBlock)
BBs.push_back(MergeBlock);
if (opts::Verbosity >= 1) {
outs() << "BOLT-INFO: ICP succeeded in "
<< Function << " @ " << InstIdx
<< " in " << BB->getName()
<< " -> calls = " << NumCalls << "\n";
}
if (IsJumpTable)
++TotalOptimizedJumpTableCallsites;
else
++TotalOptimizedIndirectCallsites;
Modified.insert(&Function);
}
}
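// Accumulate the per-function counts into the binary-wide totals.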
TotalIndirectCalls += FuncTotalIndirectCalls;
TotalIndirectJmps += FuncTotalIndirectJmps;
}
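// Report aggregate ICP statistics for the binary.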
outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
<< TotalIndirectCallsites
<< "\n"
<< "BOLT-INFO: ICP total jump table callsites = "
<< TotalJumpTableCallsites
<< "\n"
<< "BOLT-INFO: ICP total number of calls = "
<< TotalCalls
<< "\n"
<< "BOLT-INFO: ICP percentage of calls that are indirect = "
<< format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls)
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect calls that can be "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentCalls) /
std::max<size_t>(TotalIndirectCalls, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect callsites that are "
"optimized = "
<< format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
std::max<uint64_t>(TotalIndirectCallsites, 1))
<< "%\n"
<< "BOLT-INFO: ICP number of method load elimination candidates = "
<< TotalMethodLoadEliminationCandidates
<< "\n"
<< "BOLT-INFO: ICP percentage of method calls candidates that have "
"loads eliminated = "
<< format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect branches that are "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentJmps) /
std::max<uint64_t>(TotalIndirectJmps, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of jump table callsites that are "
<< "optimized = "
<< format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
std::max<uint64_t>(TotalJumpTableCallsites, 1))
<< "%\n"
<< "BOLT-INFO: ICP number of jump table callsites that can use hot "
<< "indices = " << TotalIndexBasedCandidates
<< "\n"
<< "BOLT-INFO: ICP percentage of jump table callsites that use hot "
"indices = "
<< format("%.1f", (100.0 * TotalIndexBasedJumps) /
std::max<uint64_t>(TotalIndexBasedCandidates, 1))
<< "%\n";
(void)verifyProfile;
#ifndef NDEBUG
verifyProfile(BFs);
#endif
}
} // namespace bolt
} // namespace llvm