[BOLT] [AArch64] Handle constant islands spanning multiple functions

Fix BOLT's constant island mapping when a constant island marked by $d
spans multiple functions. Currently, because BOLT only marks the
constant island in the first function where $d is located, if the next
function contains data at its start, BOLT will miss the data and try
to disassemble it. This patch adds code to explicitly go through all
symbols between $d and $x markers and mark their respective offsets as
data, which stops BOLT from trying to disassemble data. It also adds
MarkerType enum and refactors related functions.

Reviewed By: yota9, rafauler

Differential Revision: https://reviews.llvm.org/D126177
This commit is contained in:
Denis Revunov 2022-05-31 11:50:59 -07:00 committed by Rafael Auler
parent 218393f44e
commit 8579db96e8
7 changed files with 243 additions and 70 deletions

View File

@ -82,6 +82,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
return OS;
}
// AArch64-specific symbol markers used to delimit code/data in .text.
enum class MarkerSymType : char {
NONE = 0,
CODE,
DATA,
};
enum class MemoryContentsType : char {
UNKNOWN = 0, /// Unknown contents.
POSSIBLE_JUMP_TABLE, /// Possibly a non-PIC jump table.
@ -662,6 +669,11 @@ public:
TheTriple->getArch() == llvm::Triple::x86_64;
}
// AArch64-specific functions to check if symbol is used to delimit
// code/data in .text. Code is marked by $x, data by $d.
MarkerSymType getMarkerType(const SymbolRef &Symbol) const;
bool isMarker(const SymbolRef &Symbol) const;
/// Iterate over all BinaryData.
iterator_range<binary_data_const_iterator> getBinaryData() const {
return make_range(BinaryDataMap.begin(), BinaryDataMap.end());

View File

@ -1948,11 +1948,6 @@ public:
return ColdLSDASymbol;
}
/// True if the symbol is a mapping symbol used in AArch64 to delimit
/// data inside code section.
bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const;
bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const;
void setOutputDataAddress(uint64_t Address) { OutputDataOffset = Address; }
uint64_t getOutputDataAddress() const { return OutputDataOffset; }

View File

@ -1639,6 +1639,35 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
}
}
MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const {
// For aarch64, the ABI defines mapping symbols so we identify data in the
// code section (see IHI0056B). $x identifies a symbol starting code or the
// end of a data chunk inside code, $d indentifies start of data.
if (!isAArch64() || ELFSymbolRef(Symbol).getSize())
return MarkerSymType::NONE;
Expected<StringRef> NameOrError = Symbol.getName();
Expected<object::SymbolRef::Type> TypeOrError = Symbol.getType();
if (!TypeOrError || !NameOrError)
return MarkerSymType::NONE;
if (*TypeOrError != SymbolRef::ST_Unknown)
return MarkerSymType::NONE;
if (*NameOrError == "$x" || NameOrError->startswith("$x."))
return MarkerSymType::CODE;
if (*NameOrError == "$d" || NameOrError->startswith("$d."))
return MarkerSymType::DATA;
return MarkerSymType::NONE;
}
bool BinaryContext::isMarker(const SymbolRef &Symbol) const {
return getMarkerType(Symbol) != MarkerSymType::NONE;
}
void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
uint64_t Offset,
const BinaryFunction *Function,

View File

@ -3926,33 +3926,6 @@ void BinaryFunction::deleteConservativeEdges() {
}
}
bool BinaryFunction::isDataMarker(const SymbolRef &Symbol,
uint64_t SymbolSize) const {
// For aarch64, the ABI defines mapping symbols so we identify data in the
// code section (see IHI0056B). $d identifies a symbol starting data contents.
if (BC.isAArch64() && Symbol.getType() &&
cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
Symbol.getName() &&
(cantFail(Symbol.getName()) == "$d" ||
cantFail(Symbol.getName()).startswith("$d.")))
return true;
return false;
}
bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol,
uint64_t SymbolSize) const {
// For aarch64, the ABI defines mapping symbols so we identify data in the
// code section (see IHI0056B). $x identifies a symbol starting code or the
// end of a data chunk inside code.
if (BC.isAArch64() && Symbol.getType() &&
cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
Symbol.getName() &&
(cantFail(Symbol.getName()) == "$x" ||
cantFail(Symbol.getName()).startswith("$x.")))
return true;
return false;
}
bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
uint64_t SymbolSize) const {
// If this symbol is in a different section from the one where the
@ -3963,7 +3936,7 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
// Some symbols are tolerated inside function bodies, others are not.
// The real function boundaries may not be known at this point.
if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize))
if (BC.isMarker(Symbol))
return true;
// It's okay to have a zero-sized symbol in the middle of non-zero-sized

View File

@ -880,47 +880,88 @@ void RewriteInstance::discoverFileObjects() {
std::vector<SymbolRef> SortedFileSymbols;
std::copy_if(InputFile->symbol_begin(), InputFile->symbol_end(),
std::back_inserter(SortedFileSymbols), isSymbolInMemory);
auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) {
// Marker symbols have the highest precedence, while
// SECTIONs have the lowest.
auto AddressA = cantFail(A.getAddress());
auto AddressB = cantFail(B.getAddress());
if (AddressA != AddressB)
return AddressA < AddressB;
std::stable_sort(
SortedFileSymbols.begin(), SortedFileSymbols.end(),
[](const SymbolRef &A, const SymbolRef &B) {
// FUNC symbols have the highest precedence, while SECTIONs
// have the lowest.
uint64_t AddressA = cantFail(A.getAddress());
uint64_t AddressB = cantFail(B.getAddress());
if (AddressA != AddressB)
return AddressA < AddressB;
bool AMarker = BC->isMarker(A);
bool BMarker = BC->isMarker(B);
if (AMarker || BMarker) {
return AMarker && !BMarker;
}
SymbolRef::Type AType = cantFail(A.getType());
SymbolRef::Type BType = cantFail(B.getType());
if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function)
return true;
if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug)
return true;
auto AType = cantFail(A.getType());
auto BType = cantFail(B.getType());
if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function)
return true;
if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug)
return true;
return false;
});
return false;
};
std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(),
CompareSymbols);
auto LastSymbol = SortedFileSymbols.end() - 1;
// For aarch64, the ABI defines mapping symbols so we identify data in the
// code section (see IHI0056B). $d identifies data contents.
auto LastSymbol = SortedFileSymbols.end() - 1;
// Compilers usually merge multiple data objects in a single $d-$x interval,
// but we need every data object to be marked with $d. Because of that we
// create a vector of MarkerSyms with all locations of data objects.
struct MarkerSym {
uint64_t Address;
MarkerSymType Type;
};
std::vector<MarkerSym> SortedMarkerSymbols;
auto addExtraDataMarkerPerSymbol =
[this](const std::vector<SymbolRef> &SortedFileSymbols,
std::vector<MarkerSym> &SortedMarkerSymbols) {
bool IsData = false;
uint64_t LastAddr = 0;
for (auto Sym = SortedFileSymbols.begin();
Sym < SortedFileSymbols.end(); ++Sym) {
uint64_t Address = cantFail(Sym->getAddress());
if (LastAddr == Address) // don't repeat markers
continue;
MarkerSymType MarkerType = BC->getMarkerType(*Sym);
if (MarkerType != MarkerSymType::NONE) {
SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType});
LastAddr = Address;
IsData = MarkerType == MarkerSymType::DATA;
continue;
}
if (IsData) {
SortedMarkerSymbols.push_back(
MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA});
LastAddr = Address;
}
}
};
if (BC->isAArch64()) {
addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols);
LastSymbol = std::stable_partition(
SortedFileSymbols.begin(), SortedFileSymbols.end(),
[](const SymbolRef &Symbol) {
StringRef Name = cantFail(Symbol.getName());
return !(cantFail(Symbol.getType()) == SymbolRef::ST_Unknown &&
(Name == "$d" || Name.startswith("$d.") || Name == "$x" ||
Name.startswith("$x.")));
});
[this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); });
--LastSymbol;
}
BinaryFunction *PreviousFunction = nullptr;
unsigned AnonymousId = 0;
const auto MarkersBegin = std::next(LastSymbol);
for (auto ISym = SortedFileSymbols.begin(); ISym != MarkersBegin; ++ISym) {
const auto SortedSymbolsEnd = std::next(LastSymbol);
for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd;
++ISym) {
const SymbolRef &Symbol = *ISym;
// Keep undefined symbols for pretty printing?
if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined)
@ -1213,25 +1254,24 @@ void RewriteInstance::discoverFileObjects() {
adjustFunctionBoundaries();
// Annotate functions with code/data markers in AArch64
for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) {
const SymbolRef &Symbol = *ISym;
uint64_t Address =
cantFail(Symbol.getAddress(), "cannot get symbol address");
uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize();
BinaryFunction *BF =
BC->getBinaryFunctionContainingAddress(Address, true, true);
for (auto ISym = SortedMarkerSymbols.begin();
ISym != SortedMarkerSymbols.end(); ++ISym) {
auto *BF =
BC->getBinaryFunctionContainingAddress(ISym->Address, true, true);
if (!BF) {
// Stray marker
continue;
}
const uint64_t EntryOffset = Address - BF->getAddress();
if (BF->isCodeMarker(Symbol, SymbolSize)) {
const auto EntryOffset = ISym->Address - BF->getAddress();
if (ISym->Type == MarkerSymType::CODE) {
BF->markCodeAtOffset(EntryOffset);
continue;
}
if (BF->isDataMarker(Symbol, SymbolSize)) {
if (ISym->Type == MarkerSymType::DATA) {
BF->markDataAtOffset(EntryOffset);
BC->AddressToConstantIslandMap[Address] = BF;
BC->AddressToConstantIslandMap[ISym->Address] = BF;
continue;
}
llvm_unreachable("Unknown marker");

View File

@ -0,0 +1,90 @@
--- !ELF
FileHeader:
Class: ELFCLASS64
Data: ELFDATA2LSB
Type: ET_EXEC
Machine: EM_AARCH64
Entry: 0x210134
ProgramHeaders:
- Type: PT_PHDR
Flags: [ PF_R ]
VAddr: 0x200040
Align: 0x8
FileSize: 0x0000e0
MemSize: 0x0000e0
Offset: 0x000040
- Type: PT_LOAD
Flags: [ PF_R ]
VAddr: 0x200000
Align: 0x10000
FileSize: 0x000120
MemSize: 0x000120
Offset: 0x000000
- Type: PT_LOAD
Flags: [ PF_X, PF_R ]
FirstSec: .text
LastSec: .text
VAddr: 0x210120
Align: 0x10000
- Type: PT_GNU_STACK
Flags: [ PF_W, PF_R ]
Align: 0x0
Sections:
- Name: .text
Type: SHT_PROGBITS
Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
Address: 0x210120
AddressAlign: 0x4
Content: 030F0B0700000000030F0B0700000000C0035FD6FFFFFF97000080D2A80B8052010000D4
- Name: .rela.text
Type: SHT_RELA
Flags: [ SHF_INFO_LINK ]
Link: .symtab
AddressAlign: 0x8
Info: .text
Relocations:
- Offset: 0x210134
Symbol: dummy
Type: R_AARCH64_CALL26
- Name: .comment
Type: SHT_PROGBITS
Flags: [ SHF_MERGE, SHF_STRINGS ]
AddressAlign: 0x1
EntSize: 0x1
Content: 4C696E6B65723A204C4C442031352E302E3000
Symbols:
- Name: val
Index: SHN_ABS
Value: 0x70B0F03
- Name: first
Section: .text
Value: 0x210120
Size: 0x8
- Name: '$d.0'
Section: .text
Value: 0x210120
- Name: second
Section: .text
Value: 0x210128
Size: 0x8
- Name: '$x.1'
Section: .text
Value: 0x210130
- Name: .text
Type: STT_SECTION
Section: .text
Value: 0x210120
- Name: .comment
Type: STT_SECTION
Section: .comment
- Name: dummy
Type: STT_FUNC
Section: .text
Binding: STB_GLOBAL
Value: 0x210130
- Name: _start
Type: STT_FUNC
Section: .text
Binding: STB_GLOBAL
Value: 0x210134
...

View File

@ -0,0 +1,34 @@
// This test checks that multiple data objects in text of which only first is marked get disassembled properly
// RUN: yaml2obj %S/Inputs/unmarked-data.yaml -o %t.exe
// RUN: llvm-bolt %t.exe -o %t.bolt -lite=0 -use-old-text=0 2>&1 | FileCheck %s
// CHECK-NOT: BOLT-WARNING
// RUN: llvm-objdump -j .text -d --disassemble-symbols=first,second %t.bolt | FileCheck %s -check-prefix=CHECK-SYMBOL
// CHECK-SYMBOL: <first>:
// CHECK-SYMBOL: <second>:
// YAML is based in the following assembly:
.equ val, 0x070b0f03 // we use constant that is not a valid instruction so that it can't be silently dissassembled
.text
first:
.xword val
.size first, .-first
second:
.xword val
.size second, .-second
.globl dummy
.type dummy, %function
dummy: // dummy function to force relocations
ret
.globl _start
.type _start, %function
_start:
bl dummy
mov x0, #0
mov w8, #93
svc #0