[BOLT] Decoder cache friendly alignment wrt Intel JCC Erratum

Summary:
This diff ports reviews.llvm.org/D70157 to our LLVM tree, which
makes the integrated assembler able to align X86 control-flow changing
instructions in a way that reduces the performance impact of the microcode
update on Intel processors that implement the JCC erratum mitigation.

See the white paper "Mitigations for Jump Conditional Code Erratum" by
Intel, published November 2019.

To port this patch, I changed classifySecondInstInMacroFusion to analyze
instruction opcodes directly instead of analyzing the CondCode operand
(in more recent versions of LLVM, all conditional branches share the
same opcode, but with a different condition-code operand). I also pulled
Alignment.h into our tree as a dependency, along with the macro-op
analysis helpers.

-x86-align-branch-boundary and -x86-align-branch are the two flags that
control nop insertion to avoid disabling the decoder cache, following
the original patch. In BOLT, I added the flag
-x86-align-branch-boundary-hot-only to request that the alignment only be
applied to hot code, which is turned on by default. The reason is
that such alignment is expensive to perform on large modules, but if
we limit it to hot code, the relaxation pass runtime becomes tolerable.

(cherry picked from FBD19828850)
This commit is contained in:
Rafael Auler 2020-02-10 18:50:53 -08:00 committed by Maksim Panchenko
parent d5b8fc8fbe
commit c82e7fd1cc
2 changed files with 1365 additions and 14 deletions

File diff suppressed because it is too large Load Diff

View File

@ -81,6 +81,9 @@ using namespace llvm;
using namespace object;
using namespace bolt;
extern cl::opt<uint32_t> X86AlignBranchBoundary;
extern cl::opt<bool> X86AlignBranchWithin32BBoundaries;
namespace opts {
extern bool HeatmapMode;
@ -432,6 +435,12 @@ WriteBoltInfoSection("bolt-info",
cl::Hidden,
cl::cat(BoltOutputCategory));
// Restricts branch-boundary alignment to hot code (enabled by default).
// When set, emitFunctions() temporarily zeroes X86AlignBranchBoundary while
// emitting functions without a valid profile (if any profile data exists) and
// while emitting the cold part of split functions, then restores the original
// value afterwards.
static cl::opt<bool>
X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only",
cl::desc("only apply branch boundary alignment in hot code"),
cl::init(true),
cl::cat(BoltOptCategory));
bool isHotTextMover(const BinaryFunction &Function) {
for (auto &SectionName : opts::HotTextMoveSections) {
if (Function.getOriginSectionName() == SectionName)
@ -1765,6 +1774,18 @@ void RewriteInstance::adjustCommandLineOptions() {
opts::AlignMacroOpFusion = MFT_NONE;
}
if ((X86AlignBranchWithin32BBoundaries || X86AlignBranchBoundary != 0) &&
BC->isX86()) {
if (!BC->HasRelocations) {
errs() << "BOLT-ERROR: cannot apply mitigations for Intel JCC erratum in "
"non-relocation mode\n";
exit(1);
}
outs() << "BOLT-WARNING: using mitigation for Intel JCC erratum, layout "
"may take several minutes\n";
opts::AlignMacroOpFusion = MFT_NONE;
}
if (opts::AlignMacroOpFusion != MFT_NONE &&
!BC->HasRelocations) {
outs() << "BOLT-INFO: disabling -align-macro-fusion in non-relocation "
@ -3010,6 +3031,8 @@ void RewriteInstance::updateSDTMarkers() {
void RewriteInstance::emitFunctions(MCStreamer *Streamer) {
auto emit = [&](const std::vector<BinaryFunction *> &Functions) {
const auto HasProfile = BC->NumProfiledFuncs > 0;
const uint32_t OriginalBranchBoundaryAlign = X86AlignBranchBoundary;
for (auto *Function : Functions) {
if (!BC->HasRelocations &&
(!Function->isSimple() || !opts::shouldProcess(*Function)))
@ -3020,10 +3043,19 @@ void RewriteInstance::emitFunctions(MCStreamer *Streamer) {
<< Function->getFunctionNumber() << '\n');
bool Emitted{false};
// Turn off Intel JCC Erratum mitigation for cold code if requested
if (HasProfile && opts::X86AlignBranchBoundaryHotOnly &&
!Function->hasValidProfile())
X86AlignBranchBoundary = 0;
Emitted |= emitFunction(*Streamer, *Function, /*EmitColdPart=*/false);
if (Function->isSplit())
if (Function->isSplit()) {
if (opts::X86AlignBranchBoundaryHotOnly)
X86AlignBranchBoundary = 0;
Emitted |= emitFunction(*Streamer, *Function, /*EmitColdPart=*/true);
}
X86AlignBranchBoundary = OriginalBranchBoundaryAlign;
if (Emitted)
Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);