2015-12-29 17:06:16 +08:00
|
|
|
//===- lib/MC/MCFragment.cpp - Assembler Fragment Implementation ----------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2015-12-29 17:06:16 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/MC/MCFragment.h"
|
2017-02-08 07:02:00 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2015-12-29 17:06:16 +08:00
|
|
|
#include "llvm/ADT/StringExtras.h"
|
|
|
|
#include "llvm/ADT/Twine.h"
|
2018-04-30 22:59:11 +08:00
|
|
|
#include "llvm/Config/llvm-config.h"
|
2015-12-29 17:06:16 +08:00
|
|
|
#include "llvm/MC/MCAsmLayout.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/MC/MCAssembler.h"
|
2015-12-29 17:06:16 +08:00
|
|
|
#include "llvm/MC/MCContext.h"
|
|
|
|
#include "llvm/MC/MCExpr.h"
|
2017-02-08 07:02:00 +08:00
|
|
|
#include "llvm/MC/MCFixup.h"
|
2015-12-29 17:06:16 +08:00
|
|
|
#include "llvm/MC/MCSection.h"
|
|
|
|
#include "llvm/MC/MCSymbol.h"
|
|
|
|
#include "llvm/MC/MCValue.h"
|
2017-02-08 07:02:00 +08:00
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include "llvm/Support/Compiler.h"
|
2015-12-29 17:06:16 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2017-02-08 07:02:00 +08:00
|
|
|
#include <cassert>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <utility>
|
|
|
|
|
2015-12-29 17:06:16 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2017-02-08 07:02:00 +08:00
|
|
|
// Build the layout for the given assembler and compute the section layout
// order: every non-virtual section first (in assembler order), followed by
// every virtual section (also in assembler order).
MCAsmLayout::MCAsmLayout(MCAssembler &Asm) : Assembler(Asm) {
  // Appends, in assembler order, each section whose "virtual" status matches
  // the requested one.
  auto AppendSections = [&](bool WantVirtual) {
    for (MCSection &Section : Asm)
      if (Section.isVirtualSection() == WantVirtual)
        SectionOrder.push_back(&Section);
  };

  // Virtual sections must go last.
  AppendSections(/*WantVirtual=*/false);
  AppendSections(/*WantVirtual=*/true);
}
|
|
|
|
|
|
|
|
bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
|
|
|
|
const MCSection *Sec = F->getParent();
|
|
|
|
const MCFragment *LastValid = LastValidFragment.lookup(Sec);
|
|
|
|
if (!LastValid)
|
|
|
|
return false;
|
|
|
|
assert(LastValid->getParent() == Sec);
|
|
|
|
return F->getLayoutOrder() <= LastValid->getLayoutOrder();
|
|
|
|
}
|
|
|
|
|
2020-05-06 02:11:04 +08:00
|
|
|
bool MCAsmLayout::canGetFragmentOffset(const MCFragment *F) const {
|
|
|
|
MCSection *Sec = F->getParent();
|
|
|
|
MCSection::iterator I;
|
|
|
|
if (MCFragment *LastValid = LastValidFragment[Sec]) {
|
|
|
|
// Fragment already valid, offset is available.
|
|
|
|
if (F->getLayoutOrder() <= LastValid->getLayoutOrder())
|
|
|
|
return true;
|
|
|
|
I = ++MCSection::iterator(LastValid);
|
|
|
|
} else
|
|
|
|
I = Sec->begin();
|
|
|
|
|
|
|
|
// A fragment ordered before F is currently being laid out.
|
|
|
|
const MCFragment *FirstInvalidFragment = &*I;
|
|
|
|
if (FirstInvalidFragment->IsBeingLaidOut)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-12-29 17:06:16 +08:00
|
|
|
// Invalidate the layout of fragment F and of everything after it in its
// section. Does nothing when F was not valid to begin with.
void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
  if (isFragmentValid(F)) {
    // Roll the last valid fragment back to F's predecessor; for the first
    // fragment of a section this stores a null pointer.
    LastValidFragment[F->getParent()] = F->getPrevNode();
  }
}
|
|
|
|
|
|
|
|
void MCAsmLayout::ensureValid(const MCFragment *F) const {
|
|
|
|
MCSection *Sec = F->getParent();
|
|
|
|
MCSection::iterator I;
|
|
|
|
if (MCFragment *Cur = LastValidFragment[Sec])
|
|
|
|
I = ++MCSection::iterator(Cur);
|
|
|
|
else
|
|
|
|
I = Sec->begin();
|
|
|
|
|
|
|
|
// Advance the layout position until the fragment is valid.
|
|
|
|
while (!isFragmentValid(F)) {
|
|
|
|
assert(I != Sec->end() && "Layout bookkeeping error");
|
|
|
|
const_cast<MCAsmLayout *>(this)->layoutFragment(&*I);
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the offset recorded for fragment F, first making sure its layout is
// valid. An offset of ~0 is the "not set" marker and is rejected by the
// assertion.
uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
  ensureValid(F);

  const uint64_t FragOffset = F->Offset;
  assert(FragOffset != ~UINT64_C(0) && "Address not set!");
  return FragOffset;
}
|
|
|
|
|
2017-09-16 04:01:43 +08:00
|
|
|
// Simple getSymbolOffset helper for the non-variable case.
|
2015-12-29 17:06:16 +08:00
|
|
|
static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbol &S,
|
|
|
|
bool ReportError, uint64_t &Val) {
|
|
|
|
if (!S.getFragment()) {
|
|
|
|
if (ReportError)
|
|
|
|
report_fatal_error("unable to evaluate offset to undefined symbol '" +
|
|
|
|
S.getName() + "'");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
Val = Layout.getFragmentOffset(S.getFragment()) + S.getOffset();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S,
|
|
|
|
bool ReportError, uint64_t &Val) {
|
|
|
|
if (!S.isVariable())
|
|
|
|
return getLabelOffset(Layout, S, ReportError, Val);
|
|
|
|
|
|
|
|
// If SD is a variable, evaluate it.
|
|
|
|
MCValue Target;
|
|
|
|
if (!S.getVariableValue()->evaluateAsValue(Target, Layout))
|
|
|
|
report_fatal_error("unable to evaluate offset for variable '" +
|
|
|
|
S.getName() + "'");
|
|
|
|
|
|
|
|
uint64_t Offset = Target.getConstant();
|
|
|
|
|
|
|
|
const MCSymbolRefExpr *A = Target.getSymA();
|
|
|
|
if (A) {
|
|
|
|
uint64_t ValA;
|
2021-10-21 02:29:43 +08:00
|
|
|
// FIXME: On most platforms, `Target`'s component symbols are labels from
|
|
|
|
// having been simplified during evaluation, but on Mach-O they can be
|
|
|
|
// variables due to PR19203. This, and the line below for `B` can be
|
|
|
|
// restored to call `getLabelOffset` when PR19203 is fixed.
|
|
|
|
if (!getSymbolOffsetImpl(Layout, A->getSymbol(), ReportError, ValA))
|
2015-12-29 17:06:16 +08:00
|
|
|
return false;
|
|
|
|
Offset += ValA;
|
|
|
|
}
|
|
|
|
|
|
|
|
const MCSymbolRefExpr *B = Target.getSymB();
|
|
|
|
if (B) {
|
|
|
|
uint64_t ValB;
|
2021-10-21 02:29:43 +08:00
|
|
|
if (!getSymbolOffsetImpl(Layout, B->getSymbol(), ReportError, ValB))
|
2015-12-29 17:06:16 +08:00
|
|
|
return false;
|
|
|
|
Offset -= ValB;
|
|
|
|
}
|
|
|
|
|
|
|
|
Val = Offset;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool MCAsmLayout::getSymbolOffset(const MCSymbol &S, uint64_t &Val) const {
|
|
|
|
return getSymbolOffsetImpl(*this, S, false, Val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fatal variant: return the offset of S, reporting a fatal error if it cannot
// be computed.
uint64_t MCAsmLayout::getSymbolOffset(const MCSymbol &S) const {
  // Initialize the result so that it is never read uninitialized, even if the
  // helper were ever to return without writing it.
  uint64_t Val = 0;

  // With ReportError set, every failure path in the helper goes through
  // report_fatal_error (which is expected not to return), so the call should
  // always succeed; assert that instead of silently dropping the result.
  const bool Resolved =
      getSymbolOffsetImpl(*this, S, /*ReportError=*/true, Val);
  assert(Resolved && "symbol offset should be resolved or reported fatally");
  (void)Resolved;

  return Val;
}
|
|
|
|
|
|
|
|
// Return the symbol that Symbol is ultimately based on.
//
// A non-variable symbol is its own base. For a variable, its value expression
// is evaluated; the base is the positive symbol of the result. Returns nullptr
// (after reporting an error where applicable) if the expression cannot be
// evaluated, involves a subtracted symbol, has no positive symbol, or is based
// on a common symbol.
const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
  if (!Symbol.isVariable())
    return &Symbol;

  const MCExpr *Expr = Symbol.getVariableValue();

  MCValue Evaluated;
  if (!Expr->evaluateAsValue(Evaluated, *this)) {
    Assembler.getContext().reportError(Expr->getLoc(),
                                       "expression could not be evaluated");
    return nullptr;
  }

  // A subtracted symbol means there is no single base symbol.
  if (const MCSymbolRefExpr *Subtrahend = Evaluated.getSymB()) {
    Assembler.getContext().reportError(
        Expr->getLoc(),
        Twine("symbol '") + Subtrahend->getSymbol().getName() +
            "' could not be evaluated in a subtraction expression");
    return nullptr;
  }

  // A purely constant expression has no base symbol; this is not an error.
  const MCSymbolRefExpr *BaseRef = Evaluated.getSymA();
  if (!BaseRef)
    return nullptr;

  const MCSymbol &Base = BaseRef->getSymbol();
  if (!Base.isCommon())
    return &Base;

  getAssembler().getContext().reportError(
      Expr->getLoc(),
      "Common symbol '" + Base.getName() + "' cannot be used in assignment expr");
  return nullptr;
}
|
|
|
|
|
|
|
|
uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const {
|
|
|
|
// The size is the last fragment's end offset.
|
|
|
|
const MCFragment &F = Sec->getFragmentList().back();
|
|
|
|
return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const {
|
|
|
|
// Virtual sections have no file size.
|
|
|
|
if (Sec->isVirtualSection())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
// Otherwise, the file size is the same as the address space size.
|
|
|
|
return getSectionAddressSize(Sec);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
|
2018-06-15 17:48:18 +08:00
|
|
|
const MCEncodedFragment *F,
|
2015-12-29 17:06:16 +08:00
|
|
|
uint64_t FOffset, uint64_t FSize) {
|
|
|
|
uint64_t BundleSize = Assembler.getBundleAlignSize();
|
|
|
|
assert(BundleSize > 0 &&
|
|
|
|
"computeBundlePadding should only be called if bundling is enabled");
|
|
|
|
uint64_t BundleMask = BundleSize - 1;
|
|
|
|
uint64_t OffsetInBundle = FOffset & BundleMask;
|
|
|
|
uint64_t EndOfFragment = OffsetInBundle + FSize;
|
|
|
|
|
|
|
|
// There are two kinds of bundling restrictions:
|
|
|
|
//
|
|
|
|
// 1) For alignToBundleEnd(), add padding to ensure that the fragment will
|
|
|
|
// *end* on a bundle boundary.
|
|
|
|
// 2) Otherwise, check if the fragment would cross a bundle boundary. If it
|
|
|
|
// would, add padding until the end of the bundle so that the fragment
|
|
|
|
// will start in a new one.
|
|
|
|
if (F->alignToBundleEnd()) {
|
|
|
|
// Three possibilities here:
|
|
|
|
//
|
|
|
|
// A) The fragment just happens to end at a bundle boundary, so we're good.
|
|
|
|
// B) The fragment ends before the current bundle boundary: pad it just
|
|
|
|
// enough to reach the boundary.
|
|
|
|
// C) The fragment ends after the current bundle boundary: pad it until it
|
|
|
|
// reaches the end of the next bundle boundary.
|
|
|
|
//
|
|
|
|
// Note: this code could be made shorter with some modulo trickery, but it's
|
|
|
|
// intentionally kept in its more explicit form for simplicity.
|
|
|
|
if (EndOfFragment == BundleSize)
|
|
|
|
return 0;
|
|
|
|
else if (EndOfFragment < BundleSize)
|
|
|
|
return BundleSize - EndOfFragment;
|
|
|
|
else { // EndOfFragment > BundleSize
|
|
|
|
return 2 * BundleSize - EndOfFragment;
|
|
|
|
}
|
|
|
|
} else if (OffsetInBundle > 0 && EndOfFragment > BundleSize)
|
|
|
|
return BundleSize - OffsetInBundle;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* *** */
|
|
|
|
|
2016-08-31 02:40:47 +08:00
|
|
|
// Node deletion hook for lists of MCFragment: forwards to
// MCFragment::destroy() instead of deleting the node directly.
void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) {
  V->destroy();
}
|
2015-12-29 17:06:16 +08:00
|
|
|
|
|
|
|
// Construct a fragment of the given kind. The offset starts out as ~0 (the
// "not set" marker checked by MCAsmLayout::getFragmentOffset). Unless this is
// a dummy fragment, the new fragment is appended to Parent's fragment list
// when a parent section is given.
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
                       MCSection *Parent)
    : Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)), LayoutOrder(0),
      Kind(Kind), IsBeingLaidOut(false), HasInstructions(HasInstructions) {
  // Detached fragments and dummy fragments are not added to any list.
  if (!Parent || isa<MCDummyFragment>(*this))
    return;

  Parent->getFragmentList().push_back(this);
}
|
|
|
|
|
|
|
|
void MCFragment::destroy() {
|
|
|
|
// First check if we are the sentinal.
|
|
|
|
if (Kind == FragmentType(~0)) {
|
|
|
|
delete this;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (Kind) {
|
|
|
|
case FT_Align:
|
|
|
|
delete cast<MCAlignFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_Data:
|
|
|
|
delete cast<MCDataFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_CompactEncodedInst:
|
|
|
|
delete cast<MCCompactEncodedInstFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_Fill:
|
|
|
|
delete cast<MCFillFragment>(this);
|
|
|
|
return;
|
2020-07-31 09:33:33 +08:00
|
|
|
case FT_Nops:
|
|
|
|
delete cast<MCNopsFragment>(this);
|
|
|
|
return;
|
2015-12-29 17:06:16 +08:00
|
|
|
case FT_Relaxable:
|
|
|
|
delete cast<MCRelaxableFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_Org:
|
|
|
|
delete cast<MCOrgFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_Dwarf:
|
|
|
|
delete cast<MCDwarfLineAddrFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_DwarfFrame:
|
|
|
|
delete cast<MCDwarfCallFrameFragment>(this);
|
|
|
|
return;
|
|
|
|
case FT_LEB:
|
|
|
|
delete cast<MCLEBFragment>(this);
|
|
|
|
return;
|
Align branches within 32-Byte boundary (NOP padding)
WARNING: If you're looking at this patch because you're looking for a full
performace mitigation of the Intel JCC Erratum, this is not it!
This is a preliminary patch on the patch towards mitigating the performance
regressions caused by Intel's microcode update for Jump Conditional Code
Erratum. For context, see:
https://www.intel.com/content/www/us/en/support/articles/000055650.html
The patch adds the required assembler infrastructure and command line options
needed to exercise the logic for INTERNAL TESTING. These are NOT public flags,
and should not be used for anything other than LLVM's own testing/debugging
purposes. They are likely to change both in spelling and meaning.
WARNING: This patch is knowingly incorrect in some cornercases. We need, and
do not yet provide, a mechanism to selective enable/disable the padding.
Conversation on this will continue in parellel with work on extending this
infrastructure to support prefix padding.
The goal here is to have the assembler align specific instructions such that
they neither cross or end at a 32 byte boundary. The impacted instructions are:
a. Conditional jump.
b. Fused conditional jump.
c. Unconditional jump.
d. Indirect jump.
e. Ret.
f. Call.
The new options for llvm-mc are:
-x86-align-branch-boundary=NUM aligns branches within NUM byte boundary.
-x86-align-branch=TYPE[+TYPE...] specifies types of branches to align.
A new MCFragment type, MCBoundaryAlignFragment, is added, which may emit
NOP to align the fused/unfused branch.
alignBranchesBegin inserts MCBoundaryAlignFragment before instructions,
alignBranchesEnd marks the end of the branch to be aligned,
relaxBoundaryAlign grows or shrinks sizes of NOP to align the target branch.
Nop padding is disabled when the instruction may be rewritten by the linker,
such as TLS Call.
Process Note: I am landing a patch by skan as it has been LGTMed, and
continuing to iterate on the review is simply slowing us down at this point.
We can and will continue to iterate in tree.
Patch By: skan
Differential Revision: https://reviews.llvm.org/D70157
2019-12-21 02:51:05 +08:00
|
|
|
case FT_BoundaryAlign:
|
|
|
|
delete cast<MCBoundaryAlignFragment>(this);
|
|
|
|
return;
|
2017-11-09 02:57:02 +08:00
|
|
|
case FT_SymbolId:
|
|
|
|
delete cast<MCSymbolIdFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
return;
|
2016-02-03 01:41:18 +08:00
|
|
|
case FT_CVInlineLines:
|
|
|
|
delete cast<MCCVInlineLineTableFragment>(this);
|
|
|
|
return;
|
2016-02-05 09:55:49 +08:00
|
|
|
case FT_CVDefRange:
|
|
|
|
delete cast<MCCVDefRangeFragment>(this);
|
|
|
|
return;
|
[CSSPGO] Pseudo probe encoding and emission.
This change implements pseudo probe encoding and emission for CSSPGO. Please see RFC here for more context: https://groups.google.com/g/llvm-dev/c/1p1rdYbL93s
Pseudo probes are in the form of intrinsic calls on IR/MIR but they do not turn into any machine instructions. Instead they are emitted into the binary as a piece of data in standalone sections. The probe-specific sections are not needed to be loaded into memory at execution time, thus they do not incur a runtime overhead.
**ELF object emission**
The binary data to emit are organized as two ELF sections, i.e, the `.pseudo_probe_desc` section and the `.pseudo_probe` section. The `.pseudo_probe_desc` section stores a function descriptor for each function and the `.pseudo_probe` section stores the actual probes, each fo which corresponds to an IR basic block or an IR function callsite. A function descriptor is stored as a module-level metadata during the compilation and is serialized into the object file during object emission.
Both the probe descriptors and pseudo probes can be emitted into a separate ELF section per function to leverage the linker for deduplication. A `.pseudo_probe` section shares the same COMDAT group with the function code so that when the function is dead, the probes are dead and disposed too. On the contrary, a `.pseudo_probe_desc` section has its own COMDAT group. This is because even if a function is dead, its probes may be inlined into other functions and its descriptor is still needed by the profile generation tool.
The format of `.pseudo_probe_desc` section looks like:
```
.section .pseudo_probe_desc,"",@progbits
.quad 6309742469962978389 // Func GUID
.quad 4294967295 // Func Hash
.byte 9 // Length of func name
.ascii "_Z5funcAi" // Func name
.quad 7102633082150537521
.quad 138828622701
.byte 12
.ascii "_Z8funcLeafi"
.quad 446061515086924981
.quad 4294967295
.byte 9
.ascii "_Z5funcBi"
.quad -2016976694713209516
.quad 72617220756
.byte 7
.ascii "_Z3fibi"
```
For each `.pseudoprobe` section, the encoded binary data consists of a single function record corresponding to an outlined function (i.e, a function with a code entry in the `.text` section). A function record has the following format :
```
FUNCTION BODY (one for each outlined function present in the text section)
GUID (uint64)
GUID of the function
NPROBES (ULEB128)
Number of probes originating from this function.
NUM_INLINED_FUNCTIONS (ULEB128)
Number of callees inlined into this function, aka number of
first-level inlinees
PROBE RECORDS
A list of NPROBES entries. Each entry contains:
INDEX (ULEB128)
TYPE (uint4)
0 - block probe, 1 - indirect call, 2 - direct call
ATTRIBUTE (uint3)
reserved
ADDRESS_TYPE (uint1)
0 - code address, 1 - address delta
CODE_ADDRESS (uint64 or ULEB128)
code address or address delta, depending on ADDRESS_TYPE
INLINED FUNCTION RECORDS
A list of NUM_INLINED_FUNCTIONS entries describing each of the inlined
callees. Each record contains:
INLINE SITE
GUID of the inlinee (uint64)
ID of the callsite probe (ULEB128)
FUNCTION BODY
A FUNCTION BODY entry describing the inlined function.
```
To support building a context-sensitive profile, probes from inlinees are grouped by their inline contexts. An inline context is logically a call path through which a callee function lands in a caller function. The probe emitter builds an inline tree based on the debug metadata for each outlined function in the form of a trie tree. A tree root is the outlined function. Each tree edge stands for a callsite where inlining happens. Pseudo probes originating from an inlinee function are stored in a tree node and the tree path starting from the root all the way down to the tree node is the inline context of the probes. The emission happens on the whole tree top-down recursively. Probes of a tree node will be emitted altogether with their direct parent edge. Since a pseudo probe corresponds to a real code address, for size savings, the address is encoded as a delta from the previous probe except for the first probe. Variant-sized integer encoding, aka LEB128, is used for address delta and probe index.
**Assembling**
Pseudo probes can be printed as assembly directives alternatively. This allows for good assembly code readability and also provides a view of how optimizations and pseudo probes affect each other, especially helpful for diff time assembly analysis.
A pseudo probe directive has the following operands in order: function GUID, probe index, probe type, probe attributes and inline context. The directive is generated by the compiler and can be parsed by the assembler to form an encoded `.pseudoprobe` section in the object file.
A example assembly looks like:
```
foo2: # @foo2
# %bb.0: # %bb0
pushq %rax
testl %edi, %edi
.pseudoprobe 837061429793323041 1 0 0
je .LBB1_1
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 6 2 0
callq foo
.pseudoprobe 837061429793323041 3 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
.LBB1_1: # %bb1
.pseudoprobe 837061429793323041 5 1 0
callq *%rsi
.pseudoprobe 837061429793323041 2 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
# -- End function
.section .pseudo_probe_desc,"",@progbits
.quad 6699318081062747564
.quad 72617220756
.byte 3
.ascii "foo"
.quad 837061429793323041
.quad 281547593931412
.byte 4
.ascii "foo2"
```
With inlining turned on, the assembly may look different around %bb2 with an inlined probe:
```
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 3 0
.pseudoprobe 6699318081062747564 1 0 @ 837061429793323041:6
.pseudoprobe 837061429793323041 4 0
popq %rax
retq
```
**Disassembling**
We have a disassembling tool (llvm-profgen) that can display disassembly alongside with pseudo probes. So far it only supports ELF executable file.
An example disassembly looks like:
```
00000000002011a0 <foo2>:
2011a0: 50 push rax
2011a1: 85 ff test edi,edi
[Probe]: FUNC: foo2 Index: 1 Type: Block
2011a3: 74 02 je 2011a7 <foo2+0x7>
[Probe]: FUNC: foo2 Index: 3 Type: Block
[Probe]: FUNC: foo2 Index: 4 Type: Block
[Probe]: FUNC: foo Index: 1 Type: Block Inlined: @ foo2:6
2011a5: 58 pop rax
2011a6: c3 ret
[Probe]: FUNC: foo2 Index: 2 Type: Block
2011a7: bf 01 00 00 00 mov edi,0x1
[Probe]: FUNC: foo2 Index: 5 Type: IndirectCall
2011ac: ff d6 call rsi
[Probe]: FUNC: foo2 Index: 4 Type: Block
2011ae: 58 pop rax
2011af: c3 ret
```
Reviewed By: wmi
Differential Revision: https://reviews.llvm.org/D91878
2020-12-09 07:37:32 +08:00
|
|
|
case FT_PseudoProbe:
|
|
|
|
delete cast<MCPseudoProbeAddrFragment>(this);
|
|
|
|
return;
|
2015-12-29 17:06:16 +08:00
|
|
|
case FT_Dummy:
|
|
|
|
delete cast<MCDummyFragment>(this);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Debugging methods
|
|
|
|
|
|
|
|
namespace llvm {
|
|
|
|
|
|
|
|
raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
|
|
|
|
OS << "<MCFixup" << " Offset:" << AF.getOffset()
|
|
|
|
<< " Value:" << *AF.getValue()
|
|
|
|
<< " Kind:" << AF.getKind() << ">";
|
|
|
|
return OS;
|
|
|
|
}
|
|
|
|
|
2017-02-08 07:02:00 +08:00
|
|
|
} // end namespace llvm
|
2015-12-29 17:06:16 +08:00
|
|
|
|
2017-10-15 22:32:27 +08:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2017-06-22 06:19:17 +08:00
|
|
|
LLVM_DUMP_METHOD void MCFragment::dump() const {
|
2017-02-08 07:02:00 +08:00
|
|
|
raw_ostream &OS = errs();
|
2015-12-29 17:06:16 +08:00
|
|
|
|
|
|
|
OS << "<";
|
|
|
|
switch (getKind()) {
|
|
|
|
case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
|
|
|
|
case MCFragment::FT_Data: OS << "MCDataFragment"; break;
|
|
|
|
case MCFragment::FT_CompactEncodedInst:
|
|
|
|
OS << "MCCompactEncodedInstFragment"; break;
|
|
|
|
case MCFragment::FT_Fill: OS << "MCFillFragment"; break;
|
2020-07-31 09:33:33 +08:00
|
|
|
case MCFragment::FT_Nops:
|
|
|
|
OS << "MCFNopsFragment";
|
|
|
|
break;
|
2015-12-29 17:06:16 +08:00
|
|
|
case MCFragment::FT_Relaxable: OS << "MCRelaxableFragment"; break;
|
|
|
|
case MCFragment::FT_Org: OS << "MCOrgFragment"; break;
|
|
|
|
case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break;
|
|
|
|
case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
|
|
|
|
case MCFragment::FT_LEB: OS << "MCLEBFragment"; break;
|
Align branches within 32-Byte boundary (NOP padding)
WARNING: If you're looking at this patch because you're looking for a full
performance mitigation of the Intel JCC Erratum, this is not it!
This is a preliminary patch on the patch towards mitigating the performance
regressions caused by Intel's microcode update for Jump Conditional Code
Erratum. For context, see:
https://www.intel.com/content/www/us/en/support/articles/000055650.html
The patch adds the required assembler infrastructure and command line options
needed to exercise the logic for INTERNAL TESTING. These are NOT public flags,
and should not be used for anything other than LLVM's own testing/debugging
purposes. They are likely to change both in spelling and meaning.
WARNING: This patch is knowingly incorrect in some cornercases. We need, and
do not yet provide, a mechanism to selective enable/disable the padding.
Conversation on this will continue in parallel with work on extending this
infrastructure to support prefix padding.
The goal here is to have the assembler align specific instructions such that
they neither cross nor end at a 32 byte boundary. The impacted instructions are:
a. Conditional jump.
b. Fused conditional jump.
c. Unconditional jump.
d. Indirect jump.
e. Ret.
f. Call.
The new options for llvm-mc are:
-x86-align-branch-boundary=NUM aligns branches within NUM byte boundary.
-x86-align-branch=TYPE[+TYPE...] specifies types of branches to align.
A new MCFragment type, MCBoundaryAlignFragment, is added, which may emit
NOP to align the fused/unfused branch.
alignBranchesBegin inserts MCBoundaryAlignFragment before instructions,
alignBranchesEnd marks the end of the branch to be aligned,
relaxBoundaryAlign grows or shrinks sizes of NOP to align the target branch.
Nop padding is disabled when the instruction may be rewritten by the linker,
such as TLS Call.
Process Note: I am landing a patch by skan as it has been LGTMed, and
continuing to iterate on the review is simply slowing us down at this point.
We can and will continue to iterate in tree.
Patch By: skan
Differential Revision: https://reviews.llvm.org/D70157
2019-12-21 02:51:05 +08:00
|
|
|
case MCFragment::FT_BoundaryAlign: OS<<"MCBoundaryAlignFragment"; break;
|
2017-11-09 02:57:02 +08:00
|
|
|
case MCFragment::FT_SymbolId: OS << "MCSymbolIdFragment"; break;
|
2016-02-03 01:41:18 +08:00
|
|
|
case MCFragment::FT_CVInlineLines: OS << "MCCVInlineLineTableFragment"; break;
|
2016-02-05 09:55:49 +08:00
|
|
|
case MCFragment::FT_CVDefRange: OS << "MCCVDefRangeTableFragment"; break;
|
[CSSPGO] Pseudo probe encoding and emission.
This change implements pseudo probe encoding and emission for CSSPGO. Please see RFC here for more context: https://groups.google.com/g/llvm-dev/c/1p1rdYbL93s
Pseudo probes are in the form of intrinsic calls on IR/MIR but they do not turn into any machine instructions. Instead they are emitted into the binary as a piece of data in standalone sections. The probe-specific sections are not needed to be loaded into memory at execution time, thus they do not incur a runtime overhead.
**ELF object emission**
The binary data to emit are organized as two ELF sections, i.e., the `.pseudo_probe_desc` section and the `.pseudo_probe` section. The `.pseudo_probe_desc` section stores a function descriptor for each function and the `.pseudo_probe` section stores the actual probes, each of which corresponds to an IR basic block or an IR function callsite. A function descriptor is stored as a module-level metadata during the compilation and is serialized into the object file during object emission.
Both the probe descriptors and pseudo probes can be emitted into a separate ELF section per function to leverage the linker for deduplication. A `.pseudo_probe` section shares the same COMDAT group with the function code so that when the function is dead, the probes are dead and disposed too. On the contrary, a `.pseudo_probe_desc` section has its own COMDAT group. This is because even if a function is dead, its probes may be inlined into other functions and its descriptor is still needed by the profile generation tool.
The format of `.pseudo_probe_desc` section looks like:
```
.section .pseudo_probe_desc,"",@progbits
.quad 6309742469962978389 // Func GUID
.quad 4294967295 // Func Hash
.byte 9 // Length of func name
.ascii "_Z5funcAi" // Func name
.quad 7102633082150537521
.quad 138828622701
.byte 12
.ascii "_Z8funcLeafi"
.quad 446061515086924981
.quad 4294967295
.byte 9
.ascii "_Z5funcBi"
.quad -2016976694713209516
.quad 72617220756
.byte 7
.ascii "_Z3fibi"
```
For each `.pseudoprobe` section, the encoded binary data consists of a single function record corresponding to an outlined function (i.e, a function with a code entry in the `.text` section). A function record has the following format :
```
FUNCTION BODY (one for each outlined function present in the text section)
GUID (uint64)
GUID of the function
NPROBES (ULEB128)
Number of probes originating from this function.
NUM_INLINED_FUNCTIONS (ULEB128)
Number of callees inlined into this function, aka number of
first-level inlinees
PROBE RECORDS
A list of NPROBES entries. Each entry contains:
INDEX (ULEB128)
TYPE (uint4)
0 - block probe, 1 - indirect call, 2 - direct call
ATTRIBUTE (uint3)
reserved
ADDRESS_TYPE (uint1)
0 - code address, 1 - address delta
CODE_ADDRESS (uint64 or ULEB128)
code address or address delta, depending on ADDRESS_TYPE
INLINED FUNCTION RECORDS
A list of NUM_INLINED_FUNCTIONS entries describing each of the inlined
callees. Each record contains:
INLINE SITE
GUID of the inlinee (uint64)
ID of the callsite probe (ULEB128)
FUNCTION BODY
A FUNCTION BODY entry describing the inlined function.
```
To support building a context-sensitive profile, probes from inlinees are grouped by their inline contexts. An inline context is logically a call path through which a callee function lands in a caller function. The probe emitter builds an inline tree based on the debug metadata for each outlined function in the form of a trie tree. A tree root is the outlined function. Each tree edge stands for a callsite where inlining happens. Pseudo probes originating from an inlinee function are stored in a tree node and the tree path starting from the root all the way down to the tree node is the inline context of the probes. The emission happens on the whole tree top-down recursively. Probes of a tree node will be emitted altogether with their direct parent edge. Since a pseudo probe corresponds to a real code address, for size savings, the address is encoded as a delta from the previous probe except for the first probe. Variant-sized integer encoding, aka LEB128, is used for address delta and probe index.
**Assembling**
Pseudo probes can be printed as assembly directives alternatively. This allows for good assembly code readability and also provides a view of how optimizations and pseudo probes affect each other, especially helpful for diff time assembly analysis.
A pseudo probe directive has the following operands in order: function GUID, probe index, probe type, probe attributes and inline context. The directive is generated by the compiler and can be parsed by the assembler to form an encoded `.pseudoprobe` section in the object file.
An example assembly looks like:
```
foo2: # @foo2
# %bb.0: # %bb0
pushq %rax
testl %edi, %edi
.pseudoprobe 837061429793323041 1 0 0
je .LBB1_1
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 6 2 0
callq foo
.pseudoprobe 837061429793323041 3 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
.LBB1_1: # %bb1
.pseudoprobe 837061429793323041 5 1 0
callq *%rsi
.pseudoprobe 837061429793323041 2 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
# -- End function
.section .pseudo_probe_desc,"",@progbits
.quad 6699318081062747564
.quad 72617220756
.byte 3
.ascii "foo"
.quad 837061429793323041
.quad 281547593931412
.byte 4
.ascii "foo2"
```
With inlining turned on, the assembly may look different around %bb2 with an inlined probe:
```
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 3 0
.pseudoprobe 6699318081062747564 1 0 @ 837061429793323041:6
.pseudoprobe 837061429793323041 4 0
popq %rax
retq
```
**Disassembling**
We have a disassembling tool (llvm-profgen) that can display disassembly alongside with pseudo probes. So far it only supports ELF executable file.
An example disassembly looks like:
```
00000000002011a0 <foo2>:
2011a0: 50 push rax
2011a1: 85 ff test edi,edi
[Probe]: FUNC: foo2 Index: 1 Type: Block
2011a3: 74 02 je 2011a7 <foo2+0x7>
[Probe]: FUNC: foo2 Index: 3 Type: Block
[Probe]: FUNC: foo2 Index: 4 Type: Block
[Probe]: FUNC: foo Index: 1 Type: Block Inlined: @ foo2:6
2011a5: 58 pop rax
2011a6: c3 ret
[Probe]: FUNC: foo2 Index: 2 Type: Block
2011a7: bf 01 00 00 00 mov edi,0x1
[Probe]: FUNC: foo2 Index: 5 Type: IndirectCall
2011ac: ff d6 call rsi
[Probe]: FUNC: foo2 Index: 4 Type: Block
2011ae: 58 pop rax
2011af: c3 ret
```
Reviewed By: wmi
Differential Revision: https://reviews.llvm.org/D91878
2020-12-09 07:37:32 +08:00
|
|
|
case MCFragment::FT_PseudoProbe:
|
|
|
|
OS << "MCPseudoProbe";
|
|
|
|
break;
|
2016-02-03 01:41:18 +08:00
|
|
|
case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
|
2015-12-29 17:06:16 +08:00
|
|
|
}
|
|
|
|
|
2018-06-15 17:48:18 +08:00
|
|
|
OS << "<MCFragment " << (const void *)this << " LayoutOrder:" << LayoutOrder
|
|
|
|
<< " Offset:" << Offset << " HasInstructions:" << hasInstructions();
|
2019-12-23 16:29:14 +08:00
|
|
|
if (const auto *EF = dyn_cast<MCEncodedFragment>(this))
|
2018-06-15 17:48:18 +08:00
|
|
|
OS << " BundlePadding:" << static_cast<unsigned>(EF->getBundlePadding());
|
|
|
|
OS << ">";
|
2015-12-29 17:06:16 +08:00
|
|
|
|
|
|
|
switch (getKind()) {
|
|
|
|
case MCFragment::FT_Align: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *AF = cast<MCAlignFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
if (AF->hasEmitNops())
|
|
|
|
OS << " (emit nops)";
|
|
|
|
OS << "\n ";
|
|
|
|
OS << " Alignment:" << AF->getAlignment()
|
|
|
|
<< " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize()
|
|
|
|
<< " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_Data: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *DF = cast<MCDataFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " Contents:[";
|
|
|
|
const SmallVectorImpl<char> &Contents = DF->getContents();
|
|
|
|
for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
|
|
|
|
if (i) OS << ",";
|
|
|
|
OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
|
|
|
|
}
|
|
|
|
OS << "] (" << Contents.size() << " bytes)";
|
|
|
|
|
|
|
|
if (DF->fixup_begin() != DF->fixup_end()) {
|
|
|
|
OS << ",\n ";
|
|
|
|
OS << " Fixups:[";
|
|
|
|
for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(),
|
|
|
|
ie = DF->fixup_end(); it != ie; ++it) {
|
|
|
|
if (it != DF->fixup_begin()) OS << ",\n ";
|
|
|
|
OS << *it;
|
|
|
|
}
|
|
|
|
OS << "]";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_CompactEncodedInst: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *CEIF =
|
2015-12-29 17:06:16 +08:00
|
|
|
cast<MCCompactEncodedInstFragment>(this);
|
|
|
|
OS << "\n ";
|
|
|
|
OS << " Contents:[";
|
|
|
|
const SmallVectorImpl<char> &Contents = CEIF->getContents();
|
|
|
|
for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
|
|
|
|
if (i) OS << ",";
|
|
|
|
OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
|
|
|
|
}
|
|
|
|
OS << "] (" << Contents.size() << " bytes)";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_Fill: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *FF = cast<MCFillFragment>(this);
|
2017-06-23 01:57:01 +08:00
|
|
|
OS << " Value:" << static_cast<unsigned>(FF->getValue())
|
2018-05-19 01:45:48 +08:00
|
|
|
<< " ValueSize:" << static_cast<unsigned>(FF->getValueSize())
|
|
|
|
<< " NumValues:" << FF->getNumValues();
|
2015-12-29 17:06:16 +08:00
|
|
|
break;
|
|
|
|
}
|
2020-07-31 09:33:33 +08:00
|
|
|
case MCFragment::FT_Nops: {
|
|
|
|
const auto *NF = cast<MCNopsFragment>(this);
|
|
|
|
OS << " NumBytes:" << NF->getNumBytes()
|
|
|
|
<< " ControlledNopLength:" << NF->getControlledNopLength();
|
|
|
|
break;
|
|
|
|
}
|
2015-12-29 17:06:16 +08:00
|
|
|
case MCFragment::FT_Relaxable: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *F = cast<MCRelaxableFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " Inst:";
|
|
|
|
F->getInst().dump_pretty(OS);
|
2020-03-14 06:54:43 +08:00
|
|
|
OS << " (" << F->getContents().size() << " bytes)";
|
2015-12-29 17:06:16 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_Org: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *OF = cast<MCOrgFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
2017-06-23 01:57:01 +08:00
|
|
|
OS << " Offset:" << OF->getOffset()
|
|
|
|
<< " Value:" << static_cast<unsigned>(OF->getValue());
|
2015-12-29 17:06:16 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_Dwarf: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *OF = cast<MCDwarfLineAddrFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " AddrDelta:" << OF->getAddrDelta()
|
|
|
|
<< " LineDelta:" << OF->getLineDelta();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_DwarfFrame: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *CF = cast<MCDwarfCallFrameFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " AddrDelta:" << CF->getAddrDelta();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case MCFragment::FT_LEB: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *LF = cast<MCLEBFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned();
|
|
|
|
break;
|
|
|
|
}
|
Align branches within 32-Byte boundary (NOP padding)
WARNING: If you're looking at this patch because you're looking for a full
performance mitigation of the Intel JCC Erratum, this is not it!
This is a preliminary patch on the path towards mitigating the performance
regressions caused by Intel's microcode update for Jump Conditional Code
Erratum. For context, see:
https://www.intel.com/content/www/us/en/support/articles/000055650.html
The patch adds the required assembler infrastructure and command line options
needed to exercise the logic for INTERNAL TESTING. These are NOT public flags,
and should not be used for anything other than LLVM's own testing/debugging
purposes. They are likely to change both in spelling and meaning.
WARNING: This patch is knowingly incorrect in some corner cases. We need, and
do not yet provide, a mechanism to selectively enable/disable the padding.
Conversation on this will continue in parallel with work on extending this
infrastructure to support prefix padding.
The goal here is to have the assembler align specific instructions such that
they neither cross nor end at a 32-byte boundary. The impacted instructions are:
a. Conditional jump.
b. Fused conditional jump.
c. Unconditional jump.
d. Indirect jump.
e. Ret.
f. Call.
The new options for llvm-mc are:
-x86-align-branch-boundary=NUM aligns branches within NUM byte boundary.
-x86-align-branch=TYPE[+TYPE...] specifies types of branches to align.
A new MCFragment type, MCBoundaryAlignFragment, is added, which may emit
NOP to align the fused/unfused branch.
alignBranchesBegin inserts MCBoundaryAlignFragment before instructions,
alignBranchesEnd marks the end of the branch to be aligned,
relaxBoundaryAlign grows or shrinks sizes of NOP to align the target branch.
Nop padding is disabled when the instruction may be rewritten by the linker,
such as TLS Call.
Process Note: I am landing a patch by skan as it has been LGTMed, and
continuing to iterate on the review is simply slowing us down at this point.
We can and will continue to iterate in tree.
Patch By: skan
Differential Revision: https://reviews.llvm.org/D70157
2019-12-21 02:51:05 +08:00
|
|
|
case MCFragment::FT_BoundaryAlign: {
|
|
|
|
const auto *BF = cast<MCBoundaryAlignFragment>(this);
|
|
|
|
OS << "\n ";
|
2020-03-03 11:10:54 +08:00
|
|
|
OS << " BoundarySize:" << BF->getAlignment().value()
|
[X86] Reduce the number of emitted fragments due to branch align
Summary:
Currently, a BoundaryAlign fragment may be inserted after the branch
that needs to be aligned to truncate the current fragment; this fragment is
unused most of the time. To avoid that, we can insert a new empty Data
fragment instead. Non-relaxable instructions are usually emitted into a Data
fragment, so the inserted empty Data fragment is very likely to be
reused.
Reviewers: annita.zhang, reames, MaskRay, craig.topper, LuoYuanke, jyknight
Reviewed By: reames, LuoYuanke
Subscribers: llvm-commits, dexonsmith, hiraditya
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75438
2020-03-03 16:54:23 +08:00
|
|
|
<< " LastFragment:" << BF->getLastFragment()
|
2020-03-03 11:10:54 +08:00
|
|
|
<< " Size:" << BF->getSize();
|
Align branches within 32-Byte boundary (NOP padding)
WARNING: If you're looking at this patch because you're looking for a full
performance mitigation of the Intel JCC Erratum, this is not it!
This is a preliminary patch on the path towards mitigating the performance
regressions caused by Intel's microcode update for Jump Conditional Code
Erratum. For context, see:
https://www.intel.com/content/www/us/en/support/articles/000055650.html
The patch adds the required assembler infrastructure and command line options
needed to exercise the logic for INTERNAL TESTING. These are NOT public flags,
and should not be used for anything other than LLVM's own testing/debugging
purposes. They are likely to change both in spelling and meaning.
WARNING: This patch is knowingly incorrect in some corner cases. We need, and
do not yet provide, a mechanism to selectively enable/disable the padding.
Conversation on this will continue in parallel with work on extending this
infrastructure to support prefix padding.
The goal here is to have the assembler align specific instructions such that
they neither cross nor end at a 32-byte boundary. The impacted instructions are:
a. Conditional jump.
b. Fused conditional jump.
c. Unconditional jump.
d. Indirect jump.
e. Ret.
f. Call.
The new options for llvm-mc are:
-x86-align-branch-boundary=NUM aligns branches within NUM byte boundary.
-x86-align-branch=TYPE[+TYPE...] specifies types of branches to align.
A new MCFragment type, MCBoundaryAlignFragment, is added, which may emit
NOP to align the fused/unfused branch.
alignBranchesBegin inserts MCBoundaryAlignFragment before instructions,
alignBranchesEnd marks the end of the branch to be aligned,
relaxBoundaryAlign grows or shrinks sizes of NOP to align the target branch.
Nop padding is disabled when the instruction may be rewritten by the linker,
such as TLS Call.
Process Note: I am landing a patch by skan as it has been LGTMed, and
continuing to iterate on the review is simply slowing us down at this point.
We can and will continue to iterate in tree.
Patch By: skan
Differential Revision: https://reviews.llvm.org/D70157
2019-12-21 02:51:05 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-11-09 02:57:02 +08:00
|
|
|
case MCFragment::FT_SymbolId: {
|
2019-12-23 16:29:14 +08:00
|
|
|
const auto *F = cast<MCSymbolIdFragment>(this);
|
2015-12-29 17:06:16 +08:00
|
|
|
OS << "\n ";
|
|
|
|
OS << " Sym:" << F->getSymbol();
|
|
|
|
break;
|
|
|
|
}
|
2016-02-03 01:41:18 +08:00
|
|
|
case MCFragment::FT_CVInlineLines: {
|
|
|
|
const auto *F = cast<MCCVInlineLineTableFragment>(this);
|
|
|
|
OS << "\n ";
|
|
|
|
OS << " Sym:" << *F->getFnStartSym();
|
|
|
|
break;
|
|
|
|
}
|
2016-02-05 09:55:49 +08:00
|
|
|
case MCFragment::FT_CVDefRange: {
|
|
|
|
const auto *F = cast<MCCVDefRangeFragment>(this);
|
|
|
|
OS << "\n ";
|
|
|
|
for (std::pair<const MCSymbol *, const MCSymbol *> RangeStartEnd :
|
|
|
|
F->getRanges()) {
|
|
|
|
OS << " RangeStart:" << RangeStartEnd.first;
|
|
|
|
OS << " RangeEnd:" << RangeStartEnd.second;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
[CSSPGO] Pseudo probe encoding and emission.
This change implements pseudo probe encoding and emission for CSSPGO. Please see RFC here for more context: https://groups.google.com/g/llvm-dev/c/1p1rdYbL93s
Pseudo probes are in the form of intrinsic calls on IR/MIR but they do not turn into any machine instructions. Instead they are emitted into the binary as a piece of data in standalone sections. The probe-specific sections are not needed to be loaded into memory at execution time, thus they do not incur a runtime overhead.
**ELF object emission**
The binary data to emit are organized as two ELF sections, i.e., the `.pseudo_probe_desc` section and the `.pseudo_probe` section. The `.pseudo_probe_desc` section stores a function descriptor for each function and the `.pseudo_probe` section stores the actual probes, each of which corresponds to an IR basic block or an IR function callsite. A function descriptor is stored as module-level metadata during compilation and is serialized into the object file during object emission.
Both the probe descriptors and pseudo probes can be emitted into a separate ELF section per function to leverage the linker for deduplication. A `.pseudo_probe` section shares the same COMDAT group with the function code so that when the function is dead, the probes are dead and disposed too. On the contrary, a `.pseudo_probe_desc` section has its own COMDAT group. This is because even if a function is dead, its probes may be inlined into other functions and its descriptor is still needed by the profile generation tool.
The format of `.pseudo_probe_desc` section looks like:
```
.section .pseudo_probe_desc,"",@progbits
.quad 6309742469962978389 // Func GUID
.quad 4294967295 // Func Hash
.byte 9 // Length of func name
.ascii "_Z5funcAi" // Func name
.quad 7102633082150537521
.quad 138828622701
.byte 12
.ascii "_Z8funcLeafi"
.quad 446061515086924981
.quad 4294967295
.byte 9
.ascii "_Z5funcBi"
.quad -2016976694713209516
.quad 72617220756
.byte 7
.ascii "_Z3fibi"
```
For each `.pseudoprobe` section, the encoded binary data consists of a single function record corresponding to an outlined function (i.e, a function with a code entry in the `.text` section). A function record has the following format :
```
FUNCTION BODY (one for each outlined function present in the text section)
GUID (uint64)
GUID of the function
NPROBES (ULEB128)
Number of probes originating from this function.
NUM_INLINED_FUNCTIONS (ULEB128)
Number of callees inlined into this function, aka number of
first-level inlinees
PROBE RECORDS
A list of NPROBES entries. Each entry contains:
INDEX (ULEB128)
TYPE (uint4)
0 - block probe, 1 - indirect call, 2 - direct call
ATTRIBUTE (uint3)
reserved
ADDRESS_TYPE (uint1)
0 - code address, 1 - address delta
CODE_ADDRESS (uint64 or ULEB128)
code address or address delta, depending on ADDRESS_TYPE
INLINED FUNCTION RECORDS
A list of NUM_INLINED_FUNCTIONS entries describing each of the inlined
callees. Each record contains:
INLINE SITE
GUID of the inlinee (uint64)
ID of the callsite probe (ULEB128)
FUNCTION BODY
A FUNCTION BODY entry describing the inlined function.
```
To support building a context-sensitive profile, probes from inlinees are grouped by their inline contexts. An inline context is logically a call path through which a callee function lands in a caller function. The probe emitter builds an inline tree based on the debug metadata for each outlined function in the form of a trie tree. A tree root is the outlined function. Each tree edge stands for a callsite where inlining happens. Pseudo probes originating from an inlinee function are stored in a tree node and the tree path starting from the root all the way down to the tree node is the inline context of the probes. The emission happens on the whole tree top-down recursively. Probes of a tree node will be emitted altogether with their direct parent edge. Since a pseudo probe corresponds to a real code address, for size savings, the address is encoded as a delta from the previous probe except for the first probe. Variant-sized integer encoding, aka LEB128, is used for address delta and probe index.
**Assembling**
Pseudo probes can be printed as assembly directives alternatively. This allows for good assembly code readability and also provides a view of how optimizations and pseudo probes affect each other, especially helpful for diff time assembly analysis.
A pseudo probe directive has the following operands in order: function GUID, probe index, probe type, probe attributes and inline context. The directive is generated by the compiler and can be parsed by the assembler to form an encoded `.pseudoprobe` section in the object file.
An example assembly looks like:
```
foo2: # @foo2
# %bb.0: # %bb0
pushq %rax
testl %edi, %edi
.pseudoprobe 837061429793323041 1 0 0
je .LBB1_1
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 6 2 0
callq foo
.pseudoprobe 837061429793323041 3 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
.LBB1_1: # %bb1
.pseudoprobe 837061429793323041 5 1 0
callq *%rsi
.pseudoprobe 837061429793323041 2 0 0
.pseudoprobe 837061429793323041 4 0 0
popq %rax
retq
# -- End function
.section .pseudo_probe_desc,"",@progbits
.quad 6699318081062747564
.quad 72617220756
.byte 3
.ascii "foo"
.quad 837061429793323041
.quad 281547593931412
.byte 4
.ascii "foo2"
```
With inlining turned on, the assembly may look different around %bb2 with an inlined probe:
```
# %bb.2: # %bb2
.pseudoprobe 837061429793323041 3 0
.pseudoprobe 6699318081062747564 1 0 @ 837061429793323041:6
.pseudoprobe 837061429793323041 4 0
popq %rax
retq
```
**Disassembling**
We have a disassembling tool (llvm-profgen) that can display disassembly alongside pseudo probes. So far it only supports ELF executable files.
An example disassembly looks like:
```
00000000002011a0 <foo2>:
2011a0: 50 push rax
2011a1: 85 ff test edi,edi
[Probe]: FUNC: foo2 Index: 1 Type: Block
2011a3: 74 02 je 2011a7 <foo2+0x7>
[Probe]: FUNC: foo2 Index: 3 Type: Block
[Probe]: FUNC: foo2 Index: 4 Type: Block
[Probe]: FUNC: foo Index: 1 Type: Block Inlined: @ foo2:6
2011a5: 58 pop rax
2011a6: c3 ret
[Probe]: FUNC: foo2 Index: 2 Type: Block
2011a7: bf 01 00 00 00 mov edi,0x1
[Probe]: FUNC: foo2 Index: 5 Type: IndirectCall
2011ac: ff d6 call rsi
[Probe]: FUNC: foo2 Index: 4 Type: Block
2011ae: 58 pop rax
2011af: c3 ret
```
Reviewed By: wmi
Differential Revision: https://reviews.llvm.org/D91878
2020-12-09 07:37:32 +08:00
|
|
|
case MCFragment::FT_PseudoProbe: {
|
|
|
|
const auto *OF = cast<MCPseudoProbeAddrFragment>(this);
|
|
|
|
OS << "\n ";
|
|
|
|
OS << " AddrDelta:" << OF->getAddrDelta();
|
|
|
|
break;
|
|
|
|
}
|
2015-12-29 17:06:16 +08:00
|
|
|
case MCFragment::FT_Dummy:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
OS << ">";
|
|
|
|
}
|
2017-01-28 10:02:38 +08:00
|
|
|
#endif
|