2020-04-22 04:37:57 +08:00
|
|
|
//===- SyntheticSections.cpp ---------------------------------------------===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "SyntheticSections.h"
|
2021-05-26 02:57:16 +08:00
|
|
|
#include "ConcatOutputSection.h"
|
2020-04-29 07:58:22 +08:00
|
|
|
#include "Config.h"
|
2020-04-30 06:42:19 +08:00
|
|
|
#include "ExportTrie.h"
|
2020-04-28 03:50:59 +08:00
|
|
|
#include "InputFiles.h"
|
2020-05-22 06:26:35 +08:00
|
|
|
#include "MachOStructs.h"
|
2020-04-28 03:50:59 +08:00
|
|
|
#include "OutputSegment.h"
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto &sect : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
#include "SymbolTable.h"
|
2020-04-22 04:37:57 +08:00
|
|
|
#include "Symbols.h"
|
|
|
|
|
2022-01-21 03:53:18 +08:00
|
|
|
#include "lld/Common/CommonLinkerContext.h"
|
2020-12-02 06:45:09 +08:00
|
|
|
#include "llvm/ADT/STLExtras.h"
|
2021-05-19 23:07:39 +08:00
|
|
|
#include "llvm/Config/llvm-config.h"
|
2020-04-28 03:50:59 +08:00
|
|
|
#include "llvm/Support/EndianStream.h"
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sections as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
#include "llvm/Support/FileSystem.h"
|
2020-04-28 03:50:59 +08:00
|
|
|
#include "llvm/Support/LEB128.h"
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sections as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
#include "llvm/Support/Path.h"
|
2021-10-02 05:30:21 +08:00
|
|
|
|
|
|
|
#if defined(__APPLE__)
|
|
|
|
#include <sys/mman.h>
|
2022-06-22 01:49:26 +08:00
|
|
|
|
|
|
|
#define COMMON_DIGEST_FOR_OPENSSL
|
|
|
|
#include <CommonCrypto/CommonDigest.h>
|
|
|
|
#else
|
|
|
|
#include "llvm/Support/SHA256.h"
|
2021-10-02 05:30:21 +08:00
|
|
|
#endif
|
2020-04-28 03:50:59 +08:00
|
|
|
|
2021-05-19 23:07:39 +08:00
|
|
|
#ifdef LLVM_HAVE_LIBXAR
|
2021-04-17 04:46:45 +08:00
|
|
|
#include <fcntl.h>
|
2021-09-23 15:25:36 +08:00
|
|
|
extern "C" {
|
2021-04-17 04:46:45 +08:00
|
|
|
#include <xar/xar.h>
|
2021-09-23 15:25:36 +08:00
|
|
|
}
|
2021-04-17 04:46:45 +08:00
|
|
|
#endif
|
|
|
|
|
2020-04-28 03:50:59 +08:00
|
|
|
using namespace llvm;
|
2021-03-12 02:28:08 +08:00
|
|
|
using namespace llvm::MachO;
|
2020-04-28 03:50:59 +08:00
|
|
|
using namespace llvm::support;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
using namespace llvm::support::endian;
|
2020-06-25 03:22:13 +08:00
|
|
|
using namespace lld;
|
|
|
|
using namespace lld::macho;
|
2020-04-22 04:37:57 +08:00
|
|
|
|
2022-06-22 01:46:40 +08:00
|
|
|
// Reads `len` bytes at data and writes the 32-byte SHA256 checksum to `output`.
|
|
|
|
static void sha256(const uint8_t *data, size_t len, uint8_t *output) {
|
2022-06-22 01:49:26 +08:00
|
|
|
#if defined(__APPLE__)
|
|
|
|
// FIXME: Make LLVM's SHA256 faster and use it unconditionally. See PR56121
|
|
|
|
// for some notes on this.
|
|
|
|
CC_SHA256(data, len, output);
|
|
|
|
#else
|
2022-06-22 01:46:40 +08:00
|
|
|
ArrayRef<uint8_t> block(data, len);
|
|
|
|
std::array<uint8_t, 32> hash = SHA256::hash(block);
|
|
|
|
assert(hash.size() == CodeSignatureSection::hashSize);
|
|
|
|
memcpy(output, hash.data(), hash.size());
|
2022-06-22 01:49:26 +08:00
|
|
|
#endif
|
2022-06-22 01:46:40 +08:00
|
|
|
}
|
|
|
|
|
2020-06-25 03:22:13 +08:00
|
|
|
// Definition of the `macho::in` global. Code in this file uses its members,
// e.g. in.exports, in.binding, in.weakBinding and in.rebase.
InStruct macho::in;
|
|
|
|
// List of synthetic sections; the SyntheticSection constructor appends each
// newly created section to it.
std::vector<SyntheticSection *> macho::syntheticSections;
|
2020-04-22 04:37:57 +08:00
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
SyntheticSection::SyntheticSection(const char *segname, const char *name)
|
2021-07-01 06:55:36 +08:00
|
|
|
: OutputSection(SyntheticKind, name) {
|
|
|
|
std::tie(this->segname, this->name) = maybeRenameSection({segname, name});
|
[lld-macho][nfc] Eliminate InputSection::Shared
Earlier in LLD's evolution, I tried to create the illusion that
subsections were indistinguishable from "top-level" sections. Thus, even
though the subsections shared many common field values, I hid those
common values away in a private Shared struct (see D105305). More
recently, however, @gkm added a public `Section` struct in D113241 that
served as an explicit way to store values that are common to an entire
set of subsections (aka InputSections). Now that we have another "common
value" struct, `Shared` has been rendered redundant. All its fields can
be moved into `Section` instead, and the pointer to `Shared` can be replaced
with a pointer to `Section`.
This `Section` pointer also has the advantage of letting us inspect other
subsections easily, simplifying the implementation of {D118798}.
P.S. I do think that having both `Section` and `InputSection` makes for
a slightly confusing naming scheme. I considered renaming `InputSection`
to `Subsection`, but that would break the symmetry with `OutputSection`.
It would also make us deviate from LLD-ELF's naming scheme.
This change is perf-neutral on my 3.2 GHz 16-Core Intel Xeon W machine:
base diff difference (95% CI)
sys_time 1.258 ± 0.031 1.248 ± 0.023 [ -1.6% .. +0.1%]
user_time 3.659 ± 0.047 3.658 ± 0.041 [ -0.5% .. +0.4%]
wall_time 4.640 ± 0.085 4.625 ± 0.063 [ -1.0% .. +0.3%]
samples 49 61
There's also no stat sig change in RSS (as measured by `time -l`):
base diff difference (95% CI)
time 998038627.097 ± 13567305.958 1003327715.556 ± 15210451.236 [ -0.2% .. +1.2%]
samples 31 36
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D118797
2022-02-04 08:53:29 +08:00
|
|
|
isec = makeSyntheticInputSection(segname, name);
|
2021-03-13 06:26:12 +08:00
|
|
|
isec->parent = this;
|
[lld-macho] Refactor segment/section creation, sorting, and merging
Summary:
There were a few issues with the previous setup:
1. The section sorting comparator used a declarative map of section names to
determine the correct order, but it turns out we need to match on more than
just names -- in particular, an upcoming diff will sort based on whether the
S_ZERO_FILL flag is set. This diff changes the sorter to a more imperative but
flexible form.
2. We were sorting OutputSections stored in a MapVector, which left the
MapVector in an inconsistent state -- the wrong keys map to the wrong values!
In practice, we weren't doing key lookups (only container iteration) after the
sort, so this was fine, but it was still a dubious state of affairs. This diff
copies the OutputSections to a vector before sorting them.
3. We were adding unneeded OutputSections to OutputSegments and then filtering
them out later, which meant that we had to remember whether an OutputSegment
was in a pre- or post-filtered state. This diff only adds the sections to the
segments if they are needed.
In addition to those major changes, two minor ones worth noting:
1. I renamed all OutputSection variable names to `osec`, to parallel `isec`.
Previously we were using some inconsistent combination of `osec`, `os`, and
`section`.
2. I added a check (and a test) for InputSections with names that clashed with
those of our synthetic OutputSections.
Reviewers: #lld-macho
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D81887
2020-06-15 15:03:24 +08:00
|
|
|
syntheticSections.push_back(this);
|
2020-04-28 03:50:59 +08:00
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
// dyld3's MachOLoaded::getSlide() assumes that the __TEXT segment starts
|
|
|
|
// from the beginning of the file (i.e. the header).
|
|
|
|
MachHeaderSection::MachHeaderSection()
|
2021-03-19 06:49:45 +08:00
|
|
|
: SyntheticSection(segment_names::text, section_names::header) {
|
|
|
|
// XXX: This is a hack. (See D97007)
|
|
|
|
// Setting the index to 1 to pretend that this section is the text
|
|
|
|
// section.
|
|
|
|
index = 1;
|
2021-03-30 08:33:48 +08:00
|
|
|
isec->isFinal = true;
|
2021-03-19 06:49:45 +08:00
|
|
|
}
|
2020-05-02 07:29:06 +08:00
|
|
|
|
2020-04-28 03:50:59 +08:00
|
|
|
// Registers `lc` for emission after the header and adds its size to the
// running total of load-command bytes.
void MachHeaderSection::addLoadCommand(LoadCommand *lc) {
  sizeOfCmds += lc->getSize();
  loadCommands.emplace_back(lc);
}
|
|
|
|
|
2021-05-04 06:31:23 +08:00
|
|
|
uint64_t MachHeaderSection::getSize() const {
|
|
|
|
uint64_t size = target->headerSize + sizeOfCmds + config->headerPad;
|
2021-04-22 01:35:12 +08:00
|
|
|
// If we are emitting an encryptable binary, our load commands must have a
|
|
|
|
// separate (non-encrypted) page to themselves.
|
|
|
|
if (config->emitEncryptionInfo)
|
|
|
|
size = alignTo(size, target->getPageSize());
|
|
|
|
return size;
|
2020-04-28 03:50:59 +08:00
|
|
|
}
|
|
|
|
|
2021-02-23 01:12:39 +08:00
|
|
|
static uint32_t cpuSubtype() {
|
|
|
|
uint32_t subtype = target->cpuSubtype;
|
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
if (config->outputType == MH_EXECUTE && !config->staticLink &&
|
|
|
|
target->cpuSubtype == CPU_SUBTYPE_X86_64_ALL &&
|
2022-01-13 06:01:59 +08:00
|
|
|
config->platform() == PLATFORM_MACOS &&
|
2021-03-05 03:36:47 +08:00
|
|
|
config->platformInfo.minimum >= VersionTuple(10, 5))
|
2021-03-12 02:28:08 +08:00
|
|
|
subtype |= CPU_SUBTYPE_LIB64;
|
2021-02-23 01:12:39 +08:00
|
|
|
|
|
|
|
return subtype;
|
|
|
|
}
|
|
|
|
|
2021-05-04 06:31:23 +08:00
|
|
|
void MachHeaderSection::writeTo(uint8_t *buf) const {
|
|
|
|
auto *hdr = reinterpret_cast<mach_header *>(buf);
|
|
|
|
hdr->magic = target->magic;
|
2020-09-27 04:00:22 +08:00
|
|
|
hdr->cputype = target->cpuType;
|
2021-02-23 01:12:39 +08:00
|
|
|
hdr->cpusubtype = cpuSubtype();
|
2020-04-29 07:58:22 +08:00
|
|
|
hdr->filetype = config->outputType;
|
2020-04-28 03:50:59 +08:00
|
|
|
hdr->ncmds = loadCommands.size();
|
|
|
|
hdr->sizeofcmds = sizeOfCmds;
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags = MH_DYLDLINK;
|
2021-03-02 04:25:10 +08:00
|
|
|
|
|
|
|
if (config->namespaceKind == NamespaceKind::twolevel)
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags |= MH_NOUNDEFS | MH_TWOLEVEL;
|
2020-08-08 02:04:52 +08:00
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
if (config->outputType == MH_DYLIB && !config->hasReexports)
|
|
|
|
hdr->flags |= MH_NO_REEXPORTED_DYLIBS;
|
2020-04-28 03:50:59 +08:00
|
|
|
|
2021-03-09 23:17:01 +08:00
|
|
|
if (config->markDeadStrippableDylib)
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags |= MH_DEAD_STRIPPABLE_DYLIB;
|
2021-03-09 23:17:01 +08:00
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
if (config->outputType == MH_EXECUTE && config->isPic)
|
|
|
|
hdr->flags |= MH_PIE;
|
2020-09-06 01:55:33 +08:00
|
|
|
|
2021-07-12 22:26:54 +08:00
|
|
|
if (config->outputType == MH_DYLIB && config->applicationExtension)
|
|
|
|
hdr->flags |= MH_APP_EXTENSION_SAFE;
|
|
|
|
|
2020-08-28 06:59:30 +08:00
|
|
|
if (in.exports->hasWeakSymbol || in.weakBinding->hasNonWeakDefinition())
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags |= MH_WEAK_DEFINES;
|
2020-08-28 06:59:15 +08:00
|
|
|
|
2020-08-28 06:59:30 +08:00
|
|
|
if (in.exports->hasWeakSymbol || in.weakBinding->hasEntry())
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags |= MH_BINDS_TO_WEAK;
|
2020-08-28 06:59:15 +08:00
|
|
|
|
2021-03-10 13:41:34 +08:00
|
|
|
for (const OutputSegment *seg : outputSegments) {
|
|
|
|
for (const OutputSection *osec : seg->getSections()) {
|
2020-08-08 02:04:52 +08:00
|
|
|
if (isThreadLocalVariables(osec->flags)) {
|
2021-03-12 02:28:08 +08:00
|
|
|
hdr->flags |= MH_HAS_TLV_DESCRIPTORS;
|
2020-08-08 02:04:52 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-05-04 06:31:23 +08:00
|
|
|
uint8_t *p = reinterpret_cast<uint8_t *>(hdr) + target->headerSize;
|
2021-03-10 13:41:34 +08:00
|
|
|
for (const LoadCommand *lc : loadCommands) {
|
2020-04-28 03:50:59 +08:00
|
|
|
lc->writeTo(p);
|
|
|
|
p += lc->getSize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
// Creates the synthetic section named section_names::pageZero inside the
// segment_names::pageZero segment.
PageZeroSection::PageZeroSection()
    : SyntheticSection(segment_names::pageZero, section_names::pageZero) {
}
|
2020-04-28 03:50:59 +08:00
|
|
|
|
2020-09-06 01:55:33 +08:00
|
|
|
// Creates the link-edit section named section_names::rebase inside the
// segment_names::linkEdit segment.
RebaseSection::RebaseSection()
    : LinkEditSection(segment_names::linkEdit, section_names::rebase) {
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
struct Rebase {
|
|
|
|
OutputSegment *segment = nullptr;
|
|
|
|
uint64_t offset = 0;
|
|
|
|
uint64_t consecutiveCount = 0;
|
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Rebase opcodes allow us to describe a contiguous sequence of rebase location
|
|
|
|
// using a single DO_REBASE opcode. To take advantage of it, we delay emitting
|
|
|
|
// `DO_REBASE` until we have reached the end of a contiguous sequence.
|
|
|
|
static void encodeDoRebase(Rebase &rebase, raw_svector_ostream &os) {
|
|
|
|
assert(rebase.consecutiveCount != 0);
|
|
|
|
if (rebase.consecutiveCount <= REBASE_IMMEDIATE_MASK) {
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_DO_REBASE_IMM_TIMES |
|
|
|
|
rebase.consecutiveCount);
|
|
|
|
} else {
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_DO_REBASE_ULEB_TIMES);
|
|
|
|
encodeULEB128(rebase.consecutiveCount, os);
|
|
|
|
}
|
|
|
|
rebase.consecutiveCount = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void encodeRebase(const OutputSection *osec, uint64_t outSecOff,
|
|
|
|
Rebase &lastRebase, raw_svector_ostream &os) {
|
|
|
|
OutputSegment *seg = osec->parent;
|
|
|
|
uint64_t offset = osec->getSegmentOffset() + outSecOff;
|
|
|
|
if (lastRebase.segment != seg || lastRebase.offset != offset) {
|
|
|
|
if (lastRebase.consecutiveCount != 0)
|
|
|
|
encodeDoRebase(lastRebase, os);
|
|
|
|
|
|
|
|
if (lastRebase.segment != seg) {
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB |
|
|
|
|
seg->index);
|
|
|
|
encodeULEB128(offset, os);
|
|
|
|
lastRebase.segment = seg;
|
|
|
|
lastRebase.offset = offset;
|
|
|
|
} else {
|
|
|
|
assert(lastRebase.offset != offset);
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_ADD_ADDR_ULEB);
|
|
|
|
encodeULEB128(offset - lastRebase.offset, os);
|
|
|
|
lastRebase.offset = offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++lastRebase.consecutiveCount;
|
|
|
|
// DO_REBASE causes dyld to both perform the binding and increment the offset
|
2021-04-03 06:46:18 +08:00
|
|
|
lastRebase.offset += target->wordSize;
|
2020-09-06 01:55:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void RebaseSection::finalizeContents() {
|
|
|
|
if (locations.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
raw_svector_ostream os{contents};
|
|
|
|
Rebase lastRebase;
|
|
|
|
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_SET_TYPE_IMM | REBASE_TYPE_POINTER);
|
|
|
|
|
|
|
|
llvm::sort(locations, [](const Location &a, const Location &b) {
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
return a.isec->getVA(a.offset) < b.isec->getVA(b.offset);
|
2020-09-06 01:55:33 +08:00
|
|
|
});
|
2021-03-13 06:26:12 +08:00
|
|
|
for (const Location &loc : locations)
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
encodeRebase(loc.isec->parent, loc.isec->getOffset(loc.offset), lastRebase,
|
2021-03-13 06:26:12 +08:00
|
|
|
os);
|
2020-09-06 01:55:33 +08:00
|
|
|
if (lastRebase.consecutiveCount != 0)
|
|
|
|
encodeDoRebase(lastRebase, os);
|
|
|
|
|
|
|
|
os << static_cast<uint8_t>(REBASE_OPCODE_DONE);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copies the opcode bytes held in `contents` to `buf`.
void RebaseSection::writeTo(uint8_t *buf) const {
  const size_t numBytes = contents.size();
  memcpy(buf, contents.data(), numBytes);
}
|
|
|
|
|
2020-08-13 10:50:09 +08:00
|
|
|
// Creates a pointer section with the given names and sets its alignment to
// the target's word size.
NonLazyPointerSectionBase::NonLazyPointerSectionBase(const char *segname,
                                                     const char *name)
    : SyntheticSection(segname, name) {
  align = target->wordSize;
}
|
|
|
|
|
2021-03-15 06:35:27 +08:00
|
|
|
void macho::addNonLazyBindingEntries(const Symbol *sym,
|
|
|
|
const InputSection *isec, uint64_t offset,
|
|
|
|
int64_t addend) {
|
|
|
|
if (const auto *dysym = dyn_cast<DylibSymbol>(sym)) {
|
|
|
|
in.binding->addEntry(dysym, isec, offset, addend);
|
|
|
|
if (dysym->isWeakDef())
|
|
|
|
in.weakBinding->addEntry(sym, isec, offset, addend);
|
|
|
|
} else if (const auto *defined = dyn_cast<Defined>(sym)) {
|
|
|
|
in.rebase->addEntry(isec, offset);
|
|
|
|
if (defined->isExternalWeakDef())
|
|
|
|
in.weakBinding->addEntry(sym, isec, offset, addend);
|
2022-03-15 09:51:15 +08:00
|
|
|
else if (defined->interposable)
|
|
|
|
in.binding->addEntry(sym, isec, offset, addend);
|
2021-03-15 06:35:27 +08:00
|
|
|
} else {
|
|
|
|
// Undefined symbols are filtered out in scanRelocations(); we should never
|
|
|
|
// get here
|
|
|
|
llvm_unreachable("cannot bind to an undefined symbol");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-21 05:45:51 +08:00
|
|
|
// Gives `sym` a slot in this section unless it already has one. A newly added
// symbol receives the next free index as its gotIndex, and the fixups for its
// slot are registered.
void NonLazyPointerSectionBase::addEntry(Symbol *sym) {
  const bool newlyInserted = entries.insert(sym);
  if (!newlyInserted)
    return;

  assert(!sym->isInGot());
  sym->gotIndex = entries.size() - 1;

  // Slots are one target word each, so the slot offset is index * wordSize.
  addNonLazyBindingEntries(sym, isec, sym->gotIndex * target->wordSize);
}
|
|
|
|
|
2020-08-13 10:50:09 +08:00
|
|
|
void NonLazyPointerSectionBase::writeTo(uint8_t *buf) const {
|
2020-06-14 11:00:06 +08:00
|
|
|
for (size_t i = 0, n = entries.size(); i < n; ++i)
|
|
|
|
if (auto *defined = dyn_cast<Defined>(entries[i]))
|
2021-04-03 06:46:18 +08:00
|
|
|
write64le(&buf[i * target->wordSize], defined->getVA());
|
2020-06-14 11:00:06 +08:00
|
|
|
}
|
|
|
|
|
2021-06-19 21:54:11 +08:00
|
|
|
// The GOT lives in segment_names::data under section_names::got and is
// flagged as a non-lazy symbol pointer section.
GotSection::GotSection()
    : NonLazyPointerSectionBase(segment_names::data, section_names::got) {
  this->flags = S_NON_LAZY_SYMBOL_POINTERS;
}
|
|
|
|
|
|
|
|
// The thread-local pointer section lives in segment_names::data under
// section_names::threadPtrs and is flagged as holding thread-local variable
// pointers.
TlvPointerSection::TlvPointerSection()
    : NonLazyPointerSectionBase(segment_names::data,
                                section_names::threadPtrs) {
  this->flags = S_THREAD_LOCAL_VARIABLE_POINTERS;
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
// The binding opcode stream is emitted into segment_names::linkEdit under
// section_names::binding; no further setup is required.
BindingSection::BindingSection()
    : LinkEditSection(segment_names::linkEdit, section_names::binding) {
}
|
2020-04-28 03:50:59 +08:00
|
|
|
|
2020-07-03 12:19:55 +08:00
|
|
|
namespace {
|
|
|
|
struct Binding {
|
|
|
|
OutputSegment *segment = nullptr;
|
|
|
|
uint64_t offset = 0;
|
|
|
|
int64_t addend = 0;
|
|
|
|
};
|
2021-07-10 08:25:29 +08:00
|
|
|
struct BindIR {
|
|
|
|
// Default value of 0xF0 is not valid opcode and should make the program
|
|
|
|
// scream instead of accidentally writing "valid" values.
|
|
|
|
uint8_t opcode = 0xF0;
|
|
|
|
uint64_t data = 0;
|
2021-07-16 09:29:05 +08:00
|
|
|
uint64_t consecutiveCount = 0;
|
2021-07-10 08:25:29 +08:00
|
|
|
};
|
2020-07-03 12:19:55 +08:00
|
|
|
} // namespace
|
|
|
|
|
2020-08-25 12:57:59 +08:00
|
|
|
// Encode a sequence of opcodes that tell dyld to write the address of symbol +
|
2020-07-03 12:19:55 +08:00
|
|
|
// addend at osec->addr + outSecOff.
|
|
|
|
//
|
|
|
|
// The bind opcode "interpreter" remembers the values of each binding field, so
|
|
|
|
// we only need to encode the differences between bindings. Hence the use of
|
|
|
|
// lastBinding.
|
2021-07-06 08:00:09 +08:00
|
|
|
static void encodeBinding(const OutputSection *osec, uint64_t outSecOff,
|
|
|
|
int64_t addend, Binding &lastBinding,
|
2021-07-10 08:25:29 +08:00
|
|
|
std::vector<BindIR> &opcodes) {
|
2020-07-03 12:19:55 +08:00
|
|
|
OutputSegment *seg = osec->parent;
|
|
|
|
uint64_t offset = osec->getSegmentOffset() + outSecOff;
|
|
|
|
if (lastBinding.segment != seg) {
|
2021-07-16 09:29:05 +08:00
|
|
|
opcodes.push_back(
|
|
|
|
{static_cast<uint8_t>(BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB |
|
|
|
|
seg->index),
|
|
|
|
offset});
|
2020-07-03 12:19:55 +08:00
|
|
|
lastBinding.segment = seg;
|
|
|
|
lastBinding.offset = offset;
|
|
|
|
} else if (lastBinding.offset != offset) {
|
2021-07-16 09:29:05 +08:00
|
|
|
opcodes.push_back({BIND_OPCODE_ADD_ADDR_ULEB, offset - lastBinding.offset});
|
2020-07-03 12:19:55 +08:00
|
|
|
lastBinding.offset = offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lastBinding.addend != addend) {
|
2021-07-16 09:29:05 +08:00
|
|
|
opcodes.push_back(
|
|
|
|
{BIND_OPCODE_SET_ADDEND_SLEB, static_cast<uint64_t>(addend)});
|
2020-07-03 12:19:55 +08:00
|
|
|
lastBinding.addend = addend;
|
|
|
|
}
|
|
|
|
|
2021-07-16 09:29:05 +08:00
|
|
|
opcodes.push_back({BIND_OPCODE_DO_BIND, 0});
|
2020-07-03 12:19:55 +08:00
|
|
|
// DO_BIND causes dyld to both perform the binding and increment the offset
|
2021-04-03 06:46:18 +08:00
|
|
|
lastBinding.offset += target->wordSize;
|
2020-07-03 12:19:55 +08:00
|
|
|
}
|
2020-04-28 03:50:59 +08:00
|
|
|
|
2021-07-16 09:29:05 +08:00
|
|
|
static void optimizeOpcodes(std::vector<BindIR> &opcodes) {
|
|
|
|
// Pass 1: Combine bind/add pairs
|
|
|
|
size_t i;
|
|
|
|
int pWrite = 0;
|
|
|
|
for (i = 1; i < opcodes.size(); ++i, ++pWrite) {
|
|
|
|
if ((opcodes[i].opcode == BIND_OPCODE_ADD_ADDR_ULEB) &&
|
|
|
|
(opcodes[i - 1].opcode == BIND_OPCODE_DO_BIND)) {
|
|
|
|
opcodes[pWrite].opcode = BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB;
|
|
|
|
opcodes[pWrite].data = opcodes[i].data;
|
|
|
|
++i;
|
|
|
|
} else {
|
|
|
|
opcodes[pWrite] = opcodes[i - 1];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (i == opcodes.size())
|
|
|
|
opcodes[pWrite] = opcodes[i - 1];
|
|
|
|
opcodes.resize(pWrite + 1);
|
|
|
|
|
|
|
|
// Pass 2: Compress two or more bind_add opcodes
|
|
|
|
pWrite = 0;
|
|
|
|
for (i = 1; i < opcodes.size(); ++i, ++pWrite) {
|
|
|
|
if ((opcodes[i].opcode == BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB) &&
|
|
|
|
(opcodes[i - 1].opcode == BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB) &&
|
|
|
|
(opcodes[i].data == opcodes[i - 1].data)) {
|
|
|
|
opcodes[pWrite].opcode = BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB;
|
|
|
|
opcodes[pWrite].consecutiveCount = 2;
|
|
|
|
opcodes[pWrite].data = opcodes[i].data;
|
|
|
|
++i;
|
|
|
|
while (i < opcodes.size() &&
|
|
|
|
(opcodes[i].opcode == BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB) &&
|
|
|
|
(opcodes[i].data == opcodes[i - 1].data)) {
|
|
|
|
opcodes[pWrite].consecutiveCount++;
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
opcodes[pWrite] = opcodes[i - 1];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (i == opcodes.size())
|
|
|
|
opcodes[pWrite] = opcodes[i - 1];
|
|
|
|
opcodes.resize(pWrite + 1);
|
2021-07-20 09:52:45 +08:00
|
|
|
|
|
|
|
// Pass 3: Use immediate encodings
|
|
|
|
// Every binding is the size of one pointer. If the next binding is a
|
|
|
|
// multiple of wordSize away that is within BIND_IMMEDIATE_MASK, the
|
|
|
|
// opcode can be scaled by wordSize into a single byte and dyld will
|
|
|
|
// expand it to the correct address.
|
|
|
|
for (auto &p : opcodes) {
|
|
|
|
// It's unclear why the check needs to be less than BIND_IMMEDIATE_MASK,
|
|
|
|
// but ld64 currently does this. This could be a potential bug, but
|
|
|
|
// for now, perform the same behavior to prevent mysterious bugs.
|
|
|
|
if ((p.opcode == BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB) &&
|
|
|
|
((p.data / target->wordSize) < BIND_IMMEDIATE_MASK) &&
|
|
|
|
((p.data % target->wordSize) == 0)) {
|
|
|
|
p.opcode = BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED;
|
|
|
|
p.data /= target->wordSize;
|
|
|
|
}
|
|
|
|
}
|
2021-07-16 09:29:05 +08:00
|
|
|
}
|
|
|
|
|
2021-07-10 08:25:29 +08:00
|
|
|
static void flushOpcodes(const BindIR &op, raw_svector_ostream &os) {
|
|
|
|
uint8_t opcode = op.opcode & BIND_OPCODE_MASK;
|
|
|
|
switch (opcode) {
|
|
|
|
case BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
|
|
|
|
case BIND_OPCODE_ADD_ADDR_ULEB:
|
2021-07-16 09:29:05 +08:00
|
|
|
case BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
|
2021-07-10 08:25:29 +08:00
|
|
|
os << op.opcode;
|
|
|
|
encodeULEB128(op.data, os);
|
|
|
|
break;
|
|
|
|
case BIND_OPCODE_SET_ADDEND_SLEB:
|
|
|
|
os << op.opcode;
|
|
|
|
encodeSLEB128(static_cast<int64_t>(op.data), os);
|
|
|
|
break;
|
|
|
|
case BIND_OPCODE_DO_BIND:
|
|
|
|
os << op.opcode;
|
|
|
|
break;
|
2021-07-16 09:29:05 +08:00
|
|
|
case BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
|
|
|
|
os << op.opcode;
|
|
|
|
encodeULEB128(op.consecutiveCount, os);
|
|
|
|
encodeULEB128(op.data, os);
|
|
|
|
break;
|
2021-07-20 09:52:45 +08:00
|
|
|
case BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED:
|
|
|
|
os << static_cast<uint8_t>(op.opcode | op.data);
|
|
|
|
break;
|
2021-07-10 08:25:29 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("cannot bind to an unrecognized symbol");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-25 12:57:59 +08:00
|
|
|
// Non-weak bindings need to have their dylib ordinal encoded as well.
|
2021-02-27 08:13:48 +08:00
|
|
|
static int16_t ordinalForDylibSymbol(const DylibSymbol &dysym) {
|
2021-06-01 10:12:35 +08:00
|
|
|
if (config->namespaceKind == NamespaceKind::flat || dysym.isDynamicLookup())
|
|
|
|
return static_cast<int16_t>(BIND_SPECIAL_DYLIB_FLAT_LOOKUP);
|
|
|
|
assert(dysym.getFile()->isReferenced());
|
|
|
|
return dysym.getFile()->ordinal;
|
2021-02-27 08:13:48 +08:00
|
|
|
}
|
2021-02-26 08:56:31 +08:00
|
|
|
|
2022-03-15 09:51:11 +08:00
|
|
|
static int16_t ordinalForSymbol(const Symbol &sym) {
|
|
|
|
if (const auto *dysym = dyn_cast<DylibSymbol>(&sym))
|
|
|
|
return ordinalForDylibSymbol(*dysym);
|
2022-03-15 09:51:15 +08:00
|
|
|
assert(cast<Defined>(&sym)->interposable);
|
2022-03-15 09:51:11 +08:00
|
|
|
return BIND_SPECIAL_DYLIB_FLAT_LOOKUP;
|
|
|
|
}
|
|
|
|
|
2021-02-27 08:13:48 +08:00
|
|
|
static void encodeDylibOrdinal(int16_t ordinal, raw_svector_ostream &os) {
|
|
|
|
if (ordinal <= 0) {
|
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_DYLIB_SPECIAL_IMM |
|
|
|
|
(ordinal & BIND_IMMEDIATE_MASK));
|
|
|
|
} else if (ordinal <= BIND_IMMEDIATE_MASK) {
|
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_DYLIB_ORDINAL_IMM | ordinal);
|
|
|
|
} else {
|
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB);
|
|
|
|
encodeULEB128(ordinal, os);
|
2020-08-25 12:57:59 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-28 06:59:30 +08:00
|
|
|
static void encodeWeakOverride(const Defined *defined,
|
|
|
|
raw_svector_ostream &os) {
|
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM |
|
|
|
|
BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION)
|
|
|
|
<< defined->getName() << '\0';
|
|
|
|
}
|
|
|
|
|
2021-07-06 08:00:09 +08:00
|
|
|
// Organize the bindings so we can encoded them with fewer opcodes.
|
|
|
|
//
|
|
|
|
// First, all bindings for a given symbol should be grouped together.
|
|
|
|
// BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM is the largest opcode (since it
|
|
|
|
// has an associated symbol string), so we only want to emit it once per symbol.
|
|
|
|
//
|
|
|
|
// Within each group, we sort the bindings by address. Since bindings are
|
|
|
|
// delta-encoded, sorting them allows for a more compact result. Note that
|
|
|
|
// sorting by address alone ensures that bindings for the same segment / section
|
|
|
|
// are located together, minimizing the number of times we have to emit
|
|
|
|
// BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB.
|
|
|
|
//
|
|
|
|
// Finally, we sort the symbols by the address of their first binding, again
|
|
|
|
// to facilitate the delta-encoding process.
|
|
|
|
template <class Sym>
|
|
|
|
std::vector<std::pair<const Sym *, std::vector<BindingEntry>>>
|
|
|
|
sortBindings(const BindingsMap<const Sym *> &bindingsMap) {
|
|
|
|
std::vector<std::pair<const Sym *, std::vector<BindingEntry>>> bindingsVec(
|
|
|
|
bindingsMap.begin(), bindingsMap.end());
|
|
|
|
for (auto &p : bindingsVec) {
|
|
|
|
std::vector<BindingEntry> &bindings = p.second;
|
|
|
|
llvm::sort(bindings, [](const BindingEntry &a, const BindingEntry &b) {
|
|
|
|
return a.target.getVA() < b.target.getVA();
|
|
|
|
});
|
|
|
|
}
|
|
|
|
llvm::sort(bindingsVec, [](const auto &a, const auto &b) {
|
|
|
|
return a.second[0].target.getVA() < b.second[0].target.getVA();
|
|
|
|
});
|
|
|
|
return bindingsVec;
|
|
|
|
}
|
|
|
|
|
2020-04-28 03:50:59 +08:00
|
|
|
// Emit bind opcodes, which are a stream of byte-sized opcodes that dyld
|
|
|
|
// interprets to update a record with the following fields:
|
|
|
|
// * segment index (of the segment to write the symbol addresses to, typically
|
|
|
|
// the __DATA_CONST segment which contains the GOT)
|
|
|
|
// * offset within the segment, indicating the next location to write a binding
|
|
|
|
// * symbol type
|
|
|
|
// * symbol library ordinal (the index of its library's LC_LOAD_DYLIB command)
|
|
|
|
// * symbol name
|
|
|
|
// * addend
|
|
|
|
// When dyld sees BIND_OPCODE_DO_BIND, it uses the current record state to bind
|
|
|
|
// a symbol in the GOT, and increments the segment offset to point to the next
|
|
|
|
// entry. It does *not* clear the record state after doing the bind, so
|
|
|
|
// subsequent opcodes only need to encode the differences between bindings.
|
|
|
|
void BindingSection::finalizeContents() {
|
|
|
|
raw_svector_ostream os{contents};
|
2020-07-03 12:19:55 +08:00
|
|
|
Binding lastBinding;
|
2021-07-06 08:00:09 +08:00
|
|
|
int16_t lastOrdinal = 0;
|
|
|
|
|
|
|
|
for (auto &p : sortBindings(bindingsMap)) {
|
2022-03-15 09:51:11 +08:00
|
|
|
const Symbol *sym = p.first;
|
2021-07-06 08:00:09 +08:00
|
|
|
std::vector<BindingEntry> &bindings = p.second;
|
|
|
|
uint8_t flags = BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM;
|
|
|
|
if (sym->isWeakRef())
|
|
|
|
flags |= BIND_SYMBOL_FLAGS_WEAK_IMPORT;
|
|
|
|
os << flags << sym->getName() << '\0'
|
|
|
|
<< static_cast<uint8_t>(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER);
|
2022-03-15 09:51:11 +08:00
|
|
|
int16_t ordinal = ordinalForSymbol(*sym);
|
2021-07-06 08:00:09 +08:00
|
|
|
if (ordinal != lastOrdinal) {
|
2021-02-27 08:13:48 +08:00
|
|
|
encodeDylibOrdinal(ordinal, os);
|
2021-07-06 08:00:09 +08:00
|
|
|
lastOrdinal = ordinal;
|
2021-02-27 08:13:48 +08:00
|
|
|
}
|
2021-07-10 08:25:29 +08:00
|
|
|
std::vector<BindIR> opcodes;
|
2021-07-06 08:00:09 +08:00
|
|
|
for (const BindingEntry &b : bindings)
|
|
|
|
encodeBinding(b.target.isec->parent,
|
|
|
|
b.target.isec->getOffset(b.target.offset), b.addend,
|
2021-07-10 08:25:29 +08:00
|
|
|
lastBinding, opcodes);
|
2021-07-16 09:29:05 +08:00
|
|
|
if (config->optimize > 1)
|
|
|
|
optimizeOpcodes(opcodes);
|
2021-07-10 08:25:29 +08:00
|
|
|
for (const auto &op : opcodes)
|
|
|
|
flushOpcodes(op, os);
|
2020-07-03 12:19:55 +08:00
|
|
|
}
|
2021-07-06 08:00:09 +08:00
|
|
|
if (!bindingsMap.empty())
|
2021-03-12 02:28:08 +08:00
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_DONE);
|
2020-04-28 03:50:59 +08:00
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
void BindingSection::writeTo(uint8_t *buf) const {
|
2020-04-28 03:50:59 +08:00
|
|
|
memcpy(buf, contents.data(), contents.size());
|
|
|
|
}
|
|
|
|
|
2020-08-25 12:57:59 +08:00
|
|
|
// The weak-binding opcode stream is emitted into segment_names::linkEdit under
// section_names::weakBinding; no further setup is required.
WeakBindingSection::WeakBindingSection()
    : LinkEditSection(segment_names::linkEdit, section_names::weakBinding) {
}
|
|
|
|
|
|
|
|
void WeakBindingSection::finalizeContents() {
|
|
|
|
raw_svector_ostream os{contents};
|
|
|
|
Binding lastBinding;
|
|
|
|
|
2020-08-28 06:59:30 +08:00
|
|
|
for (const Defined *defined : definitions)
|
|
|
|
encodeWeakOverride(defined, os);
|
|
|
|
|
2021-07-06 08:00:09 +08:00
|
|
|
for (auto &p : sortBindings(bindingsMap)) {
|
|
|
|
const Symbol *sym = p.first;
|
|
|
|
std::vector<BindingEntry> &bindings = p.second;
|
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM)
|
|
|
|
<< sym->getName() << '\0'
|
|
|
|
<< static_cast<uint8_t>(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER);
|
2021-07-10 08:25:29 +08:00
|
|
|
std::vector<BindIR> opcodes;
|
2021-07-06 08:00:09 +08:00
|
|
|
for (const BindingEntry &b : bindings)
|
|
|
|
encodeBinding(b.target.isec->parent,
|
|
|
|
b.target.isec->getOffset(b.target.offset), b.addend,
|
2021-07-10 08:25:29 +08:00
|
|
|
lastBinding, opcodes);
|
2021-07-16 09:29:05 +08:00
|
|
|
if (config->optimize > 1)
|
|
|
|
optimizeOpcodes(opcodes);
|
2021-07-10 08:25:29 +08:00
|
|
|
for (const auto &op : opcodes)
|
|
|
|
flushOpcodes(op, os);
|
2021-07-06 08:00:09 +08:00
|
|
|
}
|
|
|
|
if (!bindingsMap.empty() || !definitions.empty())
|
2021-03-12 02:28:08 +08:00
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_DONE);
|
2020-08-25 12:57:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Copies the opcode stream held in `contents` to the start of `buf`.
void WeakBindingSection::writeTo(uint8_t *buf) const {
  const size_t numBytes = contents.size();
  memcpy(buf, contents.data(), numBytes);
}
|
|
|
|
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
// Sets up the stubs section (segment_names::text, section_names::stubs).
StubsSection::StubsSection()
    : SyntheticSection(segment_names::text, section_names::stubs) {
  this->flags =
      S_SYMBOL_STUBS | S_ATTR_SOME_INSTRUCTIONS | S_ATTR_PURE_INSTRUCTIONS;
  // Stubs are machine instructions, and instructions are 4-byte aligned on the
  // architectures we care about.
  this->align = 4;
  // reserved2 records the size of a single stub.
  this->reserved2 = target->stubSize;
}
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
2020-06-17 08:27:28 +08:00
|
|
|
uint64_t StubsSection::getSize() const {
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
return entries.size() * target->stubSize;
|
|
|
|
}
|
|
|
|
|
|
|
|
void StubsSection::writeTo(uint8_t *buf) const {
|
|
|
|
size_t off = 0;
|
2020-08-28 06:54:42 +08:00
|
|
|
for (const Symbol *sym : entries) {
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
target->writeStub(buf + off, *sym);
|
|
|
|
off += target->stubSize;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-30 08:33:48 +08:00
|
|
|
// Marks the section as final by setting `isFinal`.
void StubsSection::finalize() {
  isFinal = true;
}
|
|
|
|
|
2020-08-28 06:54:42 +08:00
|
|
|
// Registers a stub for `sym`. Returns true if the symbol was newly added, in
// which case its stub index is stored in sym->stubsIndex, and false if it
// already had a stub.
bool StubsSection::addEntry(Symbol *sym) {
  if (!entries.insert(sym))
    return false;
  sym->stubsIndex = entries.size() - 1;
  return true;
}
|
|
|
|
|
|
|
|
// Sets up the stub helper section (segment_names::text,
// section_names::stubHelper).
StubHelperSection::StubHelperSection()
    : SyntheticSection(segment_names::text, section_names::stubHelper) {
  this->flags = S_ATTR_SOME_INSTRUCTIONS | S_ATTR_PURE_INSTRUCTIONS;
  // The section consists of machine instructions, hence the 4-byte alignment.
  this->align = 4;
}
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
2020-06-17 08:27:28 +08:00
|
|
|
uint64_t StubHelperSection::getSize() const {
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
return target->stubHelperHeaderSize +
|
2020-08-28 06:54:42 +08:00
|
|
|
in.lazyBinding->getEntries().size() * target->stubHelperEntrySize;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
}
|
|
|
|
|
2020-08-28 06:54:42 +08:00
|
|
|
// The stub helper is needed exactly when the lazy binding section is.
bool StubHelperSection::isNeeded() const {
  return in.lazyBinding->isNeeded();
}
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
|
|
|
void StubHelperSection::writeTo(uint8_t *buf) const {
|
|
|
|
target->writeStubHelperHeader(buf);
|
|
|
|
size_t off = target->stubHelperHeaderSize;
|
2022-03-15 09:51:11 +08:00
|
|
|
for (const Symbol *sym : in.lazyBinding->getEntries()) {
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
target->writeStubHelperEntry(buf + off, *sym, addr + off);
|
|
|
|
off += target->stubHelperEntrySize;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void StubHelperSection::setup() {
|
2021-07-12 00:35:38 +08:00
|
|
|
Symbol *binder = symtab->addUndefined("dyld_stub_binder", /*file=*/nullptr,
|
|
|
|
/*isWeakRef=*/false);
|
|
|
|
if (auto *undefined = dyn_cast<Undefined>(binder))
|
|
|
|
treatUndefinedSymbol(*undefined,
|
|
|
|
"lazy binding (normally in libSystem.dylib)");
|
|
|
|
|
|
|
|
// treatUndefinedSymbol() can replace binder with a DylibSymbol; re-check.
|
|
|
|
stubBinder = dyn_cast_or_null<DylibSymbol>(binder);
|
|
|
|
if (stubBinder == nullptr)
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
return;
|
2021-07-12 00:35:38 +08:00
|
|
|
|
2020-08-21 05:45:51 +08:00
|
|
|
in.got->addEntry(stubBinder);
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
[lld-macho] Have ICF operate on all sections at once
ICF previously operated only within a given OutputSection. We would
merge all CFStrings first, then merge all regular code sections in a
second phase. This worked fine since CFStrings would never reference
regular `__text` sections. However, I would like to expand ICF to merge
functions that reference unwind info. Unwind info references the LSDA
section, which can in turn reference the `__text` section, so we cannot
perform ICF in phases.
In order to have ICF operate on InputSections spanning multiple
OutputSections, we need a way to distinguish InputSections that are
destined for different OutputSections, so that we don't fold across
section boundaries. We achieve this by creating OutputSections early,
and setting `InputSection::parent` to point to them. This is what
LLD-ELF does. (This change should also make it easier to implement the
`section$start$` symbols.)
This diff also folds InputSections w/o checking their flags, which I
think is the right behavior -- if they are destined for the same
OutputSection, they will have the same flags in the output (even if
their input flags differ). I.e. the `parent` pointer check subsumes the
`flags` check. In practice this has nearly no effect (ICF did not become
any more effective on chromium_framework).
I've also updated ICF.cpp's block comment to better reflect its current
status.
Reviewed By: #lld-macho, smeenai
Differential Revision: https://reviews.llvm.org/D105641
2021-07-18 01:42:26 +08:00
|
|
|
in.imageLoaderCache->parent =
|
|
|
|
ConcatOutputSection::getOrCreateForInput(in.imageLoaderCache);
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
inputSections.push_back(in.imageLoaderCache);
|
[lld/mac] Implement -dead_strip
Also adds support for live_support sections, no_dead_strip sections,
.no_dead_strip symbols.
Chromium Framework 345MB unstripped -> 250MB stripped
(vs 290MB unstripped -> 236M stripped with ld64).
Doing dead stripping is a bit faster than not, because so much less
data needs to be processed:
% ministat lld_*
x lld_nostrip.txt
+ lld_strip.txt
N Min Max Median Avg Stddev
x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794
+ 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651
Difference at 95.0% confidence
-0.144711 +/- 0.0336749
-3.60967% +/- 0.839989%
(Student's t, pooled s = 0.0358398)
This interacts with many parts of the linker. I tried to add test coverage
for all added `isLive()` checks, so that some test will fail if any of them
is removed. I checked that the test expectations for the most part match
ld64's behavior (except for live-support-iterations.s, see the comment
in the test). Interacts with:
- debug info
- export tries
- import opcodes
- flags like -exported_symbol(s_list)
- -U / dynamic_lookup
- mod_init_funcs, mod_term_funcs
- weak symbol handling
- unwind info
- stubs
- map files
- -sectcreate
- undefined, dylib, common, defined (both absolute and normal) symbols
It's possible it interacts with more features I didn't think of,
of course.
I also did some manual testing:
- check-llvm check-clang check-lld work with lld with this patch
as host linker and -dead_strip enabled
- Chromium still starts
- Chromium's base_unittests still pass, including unwind tests
Implemenation-wise, this is InputSection-based, so it'll work for
object files with .subsections_via_symbols (which includes all
object files generated by clang). I first based this on the COFF
implementation, but later realized that things are more similar to ELF.
I think it'd be good to refactor MarkLive.cpp to look more like the ELF
part at some point, but I'd like to get a working state checked in first.
Mechanical parts:
- Rename canOmitFromOutput to wasCoalesced (no behavior change)
since it really is for weak coalesced symbols
- Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP
(`.no_dead_strip` in asm)
Fixes PR49276.
Differential Revision: https://reviews.llvm.org/D103324
2021-05-08 05:10:05 +08:00
|
|
|
// Since this isn't in the symbol table or in any input file, the noDeadStrip
|
[lld-macho] Associate compact unwind entries with function symbols
Compact unwind entries (CUEs) contain pointers to their respective
function symbols. However, during the link process, it's far more useful
to have pointers from the function symbol to the CUE than vice versa.
This diff adds that pointer in the form of `Defined::compactUnwind`.
In particular, when doing dead-stripping, we want to mark CUEs live when
their function symbol is live; and when doing ICF, we want to dedup
sections iff the symbols in that section have identical CUEs. In both
cases, we want to be able to locate the symbols within a given section,
as well as locate the CUEs belonging to those symbols. So this diff also
adds `InputSection::symbols`.
The ultimate goal of this refactor is to have ICF support dedup'ing
functions with unwind info, but that will be handled in subsequent
diffs. This diff focuses on simplifying `-dead_strip` --
`findFunctionsWithUnwindInfo` is no longer necessary, and
`Defined::isLive()` is now a lot simpler. Moreover, UnwindInfoSection no
longer has to check for dead CUEs -- we simply avoid adding them in the
first place.
Additionally, we now support stripping of dead LSDAs, which follows
quite naturally since `markLive()` can now reach them via the CUEs.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D109944
2021-10-27 04:04:04 +08:00
|
|
|
// argument doesn't matter.
|
2021-04-02 08:48:09 +08:00
|
|
|
dyldPrivate =
|
|
|
|
make<Defined>("__dyld_private", nullptr, in.imageLoaderCache, 0, 0,
|
|
|
|
/*isWeakDef=*/false,
|
2021-05-01 04:17:26 +08:00
|
|
|
/*isExternal=*/false, /*isPrivateExtern=*/false,
|
2022-04-12 03:45:25 +08:00
|
|
|
/*includeInSymtab=*/true,
|
[lld/mac] Implement -dead_strip
Also adds support for live_support sections, no_dead_strip sections,
.no_dead_strip symbols.
Chromium Framework 345MB unstripped -> 250MB stripped
(vs 290MB unstripped -> 236M stripped with ld64).
Doing dead stripping is a bit faster than not, because so much less
data needs to be processed:
% ministat lld_*
x lld_nostrip.txt
+ lld_strip.txt
N Min Max Median Avg Stddev
x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794
+ 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651
Difference at 95.0% confidence
-0.144711 +/- 0.0336749
-3.60967% +/- 0.839989%
(Student's t, pooled s = 0.0358398)
This interacts with many parts of the linker. I tried to add test coverage
for all added `isLive()` checks, so that some test will fail if any of them
is removed. I checked that the test expectations for the most part match
ld64's behavior (except for live-support-iterations.s, see the comment
in the test). Interacts with:
- debug info
- export tries
- import opcodes
- flags like -exported_symbol(s_list)
- -U / dynamic_lookup
- mod_init_funcs, mod_term_funcs
- weak symbol handling
- unwind info
- stubs
- map files
- -sectcreate
- undefined, dylib, common, defined (both absolute and normal) symbols
It's possible it interacts with more features I didn't think of,
of course.
I also did some manual testing:
- check-llvm check-clang check-lld work with lld with this patch
as host linker and -dead_strip enabled
- Chromium still starts
- Chromium's base_unittests still pass, including unwind tests
Implemenation-wise, this is InputSection-based, so it'll work for
object files with .subsections_via_symbols (which includes all
object files generated by clang). I first based this on the COFF
implementation, but later realized that things are more similar to ELF.
I think it'd be good to refactor MarkLive.cpp to look more like the ELF
part at some point, but I'd like to get a working state checked in first.
Mechanical parts:
- Rename canOmitFromOutput to wasCoalesced (no behavior change)
since it really is for weak coalesced symbols
- Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP
(`.no_dead_strip` in asm)
Fixes PR49276.
Differential Revision: https://reviews.llvm.org/D103324
2021-05-08 05:10:05 +08:00
|
|
|
/*isThumb=*/false, /*isReferencedDynamically=*/false,
|
|
|
|
/*noDeadStrip=*/false);
|
[lld-macho] Associate compact unwind entries with function symbols
Compact unwind entries (CUEs) contain pointers to their respective
function symbols. However, during the link process, it's far more useful
to have pointers from the function symbol to the CUE than vice versa.
This diff adds that pointer in the form of `Defined::compactUnwind`.
In particular, when doing dead-stripping, we want to mark CUEs live when
their function symbol is live; and when doing ICF, we want to dedup
sections iff the symbols in that section have identical CUEs. In both
cases, we want to be able to locate the symbols within a given section,
as well as locate the CUEs belonging to those symbols. So this diff also
adds `InputSection::symbols`.
The ultimate goal of this refactor is to have ICF support dedup'ing
functions with unwind info, but that will be handled in subsequent
diffs. This diff focuses on simplifying `-dead_strip` --
`findFunctionsWithUnwindInfo` is no longer necessary, and
`Defined::isLive()` is now a lot simpler. Moreover, UnwindInfoSection no
longer has to check for dead CUEs -- we simply avoid adding them in the
first place.
Additionally, we now support stripping of dead LSDAs, which follows
quite naturally since `markLive()` can now reach them via the CUEs.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D109944
2021-10-27 04:04:04 +08:00
|
|
|
dyldPrivate->used = true;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Creates the section named section_names::lazySymbolPtr in
// segment_names::data, flagged S_LAZY_SYMBOL_POINTERS and aligned to the
// target's word size.
LazyPointerSection::LazyPointerSection()
    : SyntheticSection(segment_names::data, section_names::lazySymbolPtr) {
  flags = S_LAZY_SYMBOL_POINTERS;
  align = target->wordSize;
}
|
|
|
|
|
2020-06-17 08:27:28 +08:00
|
|
|
uint64_t LazyPointerSection::getSize() const {
|
2021-04-03 06:46:18 +08:00
|
|
|
return in.stubs->getEntries().size() * target->wordSize;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool LazyPointerSection::isNeeded() const {
|
|
|
|
return !in.stubs->getEntries().empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
void LazyPointerSection::writeTo(uint8_t *buf) const {
|
|
|
|
size_t off = 0;
|
2020-08-28 06:54:42 +08:00
|
|
|
for (const Symbol *sym : in.stubs->getEntries()) {
|
|
|
|
if (const auto *dysym = dyn_cast<DylibSymbol>(sym)) {
|
|
|
|
if (dysym->hasStubsHelper()) {
|
|
|
|
uint64_t stubHelperOffset =
|
|
|
|
target->stubHelperHeaderSize +
|
|
|
|
dysym->stubsHelperIndex * target->stubHelperEntrySize;
|
|
|
|
write64le(buf + off, in.stubHelper->addr + stubHelperOffset);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
write64le(buf + off, sym->getVA());
|
|
|
|
}
|
2021-04-03 06:46:18 +08:00
|
|
|
off += target->wordSize;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Creates the section named section_names::lazyBinding in
// segment_names::linkEdit; no further setup happens at construction time.
LazyBindingSection::LazyBindingSection()
    : LinkEditSection(segment_names::linkEdit, section_names::lazyBinding) {
}
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
|
|
|
void LazyBindingSection::finalizeContents() {
|
|
|
|
// TODO: Just precompute output size here instead of writing to a temporary
|
|
|
|
// buffer
|
2022-03-15 09:51:11 +08:00
|
|
|
for (Symbol *sym : entries)
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
sym->lazyBindOffset = encode(*sym);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copies the bytes accumulated in `contents` verbatim into the output buffer.
void LazyBindingSection::writeTo(uint8_t *buf) const {
  const auto numBytes = contents.size();
  memcpy(buf, contents.data(), numBytes);
}
|
|
|
|
|
2022-03-15 09:51:11 +08:00
|
|
|
// Registers `sym` for lazy binding. A symbol that is already registered is
// left untouched. A newly inserted symbol is assigned the next stub helper
// index, and a rebase entry is added for its slot in the lazy pointer section
// (at stubsIndex * wordSize).
void LazyBindingSection::addEntry(Symbol *sym) {
  if (!entries.insert(sym))
    return; // Already present.

  sym->stubsHelperIndex = entries.size() - 1;
  in.rebase->addEntry(in.lazyPointers->isec,
                      sym->stubsIndex * target->wordSize);
}
|
|
|
|
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
// Unlike the non-lazy binding section, the bind opcodes in this section aren't
|
|
|
|
// interpreted all at once. Rather, dyld will start interpreting opcodes at a
|
|
|
|
// given offset, typically only binding a single symbol before it finds a
|
|
|
|
// BIND_OPCODE_DONE terminator. As such, unlike in the non-lazy-binding case,
|
|
|
|
// we cannot encode just the differences between symbols; we have to emit the
|
|
|
|
// complete bind information for each symbol.
|
2022-03-15 09:51:11 +08:00
|
|
|
uint32_t LazyBindingSection::encode(const Symbol &sym) {
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
uint32_t opstreamOffset = contents.size();
|
|
|
|
OutputSegment *dataSeg = in.lazyPointers->parent;
|
2021-03-12 02:28:08 +08:00
|
|
|
os << static_cast<uint8_t>(BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB |
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
dataSeg->index);
|
2022-05-24 08:59:18 +08:00
|
|
|
uint64_t offset =
|
|
|
|
in.lazyPointers->addr - dataSeg->addr + sym.stubsIndex * target->wordSize;
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
encodeULEB128(offset, os);
|
2022-03-15 09:51:11 +08:00
|
|
|
encodeDylibOrdinal(ordinalForSymbol(sym), os);
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
uint8_t flags = BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM;
|
2020-12-16 10:05:06 +08:00
|
|
|
if (sym.isWeakRef())
|
2021-03-12 02:28:08 +08:00
|
|
|
flags |= BIND_SYMBOL_FLAGS_WEAK_IMPORT;
|
2020-12-16 10:05:06 +08:00
|
|
|
|
|
|
|
os << flags << sym.getName() << '\0'
|
2021-03-12 02:28:08 +08:00
|
|
|
<< static_cast<uint8_t>(BIND_OPCODE_DO_BIND)
|
|
|
|
<< static_cast<uint8_t>(BIND_OPCODE_DONE);
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
return opstreamOffset;
|
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
// Creates the section named section_names::export_ in
// segment_names::linkEdit; no further setup happens at construction time.
ExportSection::ExportSection()
    : LinkEditSection(segment_names::linkEdit, section_names::export_) {
}
|
2020-04-29 07:58:22 +08:00
|
|
|
|
|
|
|
void ExportSection::finalizeContents() {
|
2020-09-17 02:20:10 +08:00
|
|
|
trieBuilder.setImageBase(in.header->addr);
|
2020-08-28 06:59:15 +08:00
|
|
|
for (const Symbol *sym : symtab->getSymbols()) {
|
|
|
|
if (const auto *defined = dyn_cast<Defined>(sym)) {
|
[lld/mac] Implement -dead_strip
Also adds support for live_support sections, no_dead_strip sections,
.no_dead_strip symbols.
Chromium Framework 345MB unstripped -> 250MB stripped
(vs 290MB unstripped -> 236M stripped with ld64).
Doing dead stripping is a bit faster than not, because so much less
data needs to be processed:
% ministat lld_*
x lld_nostrip.txt
+ lld_strip.txt
N Min Max Median Avg Stddev
x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794
+ 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651
Difference at 95.0% confidence
-0.144711 +/- 0.0336749
-3.60967% +/- 0.839989%
(Student's t, pooled s = 0.0358398)
This interacts with many parts of the linker. I tried to add test coverage
for all added `isLive()` checks, so that some test will fail if any of them
is removed. I checked that the test expectations for the most part match
ld64's behavior (except for live-support-iterations.s, see the comment
in the test). Interacts with:
- debug info
- export tries
- import opcodes
- flags like -exported_symbol(s_list)
- -U / dynamic_lookup
- mod_init_funcs, mod_term_funcs
- weak symbol handling
- unwind info
- stubs
- map files
- -sectcreate
- undefined, dylib, common, defined (both absolute and normal) symbols
It's possible it interacts with more features I didn't think of,
of course.
I also did some manual testing:
- check-llvm check-clang check-lld work with lld with this patch
as host linker and -dead_strip enabled
- Chromium still starts
- Chromium's base_unittests still pass, including unwind tests
Implemenation-wise, this is InputSection-based, so it'll work for
object files with .subsections_via_symbols (which includes all
object files generated by clang). I first based this on the COFF
implementation, but later realized that things are more similar to ELF.
I think it'd be good to refactor MarkLive.cpp to look more like the ELF
part at some point, but I'd like to get a working state checked in first.
Mechanical parts:
- Rename canOmitFromOutput to wasCoalesced (no behavior change)
since it really is for weak coalesced symbols
- Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP
(`.no_dead_strip` in asm)
Fixes PR49276.
Differential Revision: https://reviews.llvm.org/D103324
2021-05-08 05:10:05 +08:00
|
|
|
if (defined->privateExtern || !defined->isLive())
|
2021-03-11 08:45:18 +08:00
|
|
|
continue;
|
2020-04-30 06:42:19 +08:00
|
|
|
trieBuilder.addSymbol(*defined);
|
2020-08-28 06:59:15 +08:00
|
|
|
hasWeakSymbol = hasWeakSymbol || sym->isWeakDef();
|
|
|
|
}
|
|
|
|
}
|
2020-04-30 06:42:19 +08:00
|
|
|
size = trieBuilder.build();
|
2020-04-29 07:58:22 +08:00
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
// Serializes the export trie into `buf` by delegating to the trie builder.
void ExportSection::writeTo(uint8_t *buf) const {
  trieBuilder.writeTo(buf);
}
|
2020-04-29 07:58:22 +08:00
|
|
|
|
2021-06-15 10:21:43 +08:00
|
|
|
// Constructs the data-in-code section by forwarding the __LINKEDIT segment
// name and the data-in-code section name to the LinkEditSection base.
DataInCodeSection::DataInCodeSection()
    : LinkEditSection(segment_names::linkEdit, section_names::dataInCode) {
}
|
|
|
|
|
|
|
|
template <class LP>
|
|
|
|
static std::vector<MachO::data_in_code_entry> collectDataInCodeEntries() {
|
|
|
|
std::vector<MachO::data_in_code_entry> dataInCodeEntries;
|
|
|
|
for (const InputFile *inputFile : inputFiles) {
|
|
|
|
if (!isa<ObjFile>(inputFile))
|
|
|
|
continue;
|
|
|
|
const ObjFile *objFile = cast<ObjFile>(inputFile);
|
2021-12-11 14:01:14 +08:00
|
|
|
ArrayRef<MachO::data_in_code_entry> entries = objFile->getDataInCode();
|
2021-06-15 10:21:43 +08:00
|
|
|
if (entries.empty())
|
|
|
|
continue;
|
2021-12-11 14:01:14 +08:00
|
|
|
|
|
|
|
assert(is_sorted(dataInCodeEntries, [](const data_in_code_entry &lhs,
|
|
|
|
const data_in_code_entry &rhs) {
|
|
|
|
return lhs.offset < rhs.offset;
|
|
|
|
}));
|
2021-06-15 10:21:43 +08:00
|
|
|
// For each code subsection find 'data in code' entries residing in it.
|
|
|
|
// Compute the new offset values as
|
|
|
|
// <offset within subsection> + <subsection address> - <__TEXT address>.
|
[lld-macho][nfc] Eliminate InputSection::Shared
Earlier in LLD's evolution, I tried to create the illusion that
subsections were indistinguishable from "top-level" sections. Thus, even
though the subsections shared many common field values, I hid those
common values away in a private Shared struct (see D105305). More
recently, however, @gkm added a public `Section` struct in D113241 that
served as an explicit way to store values that are common to an entire
set of subsections (aka InputSections). Now that we have another "common
value" struct, `Shared` has been rendered redundant. All its fields can
be moved into `Section` instead, and the pointer to `Shared` can be replaced
with a pointer to `Section`.
This `Section` pointer also has the advantage of letting us inspect other
subsections easily, simplifying the implementation of {D118798}.
P.S. I do think that having both `Section` and `InputSection` makes for
a slightly confusing naming scheme. I considered renaming `InputSection`
to `Subsection`, but that would break the symmetry with `OutputSection`.
It would also make us deviate from LLD-ELF's naming scheme.
This change is perf-neutral on my 3.2 GHz 16-Core Intel Xeon W machine:
base diff difference (95% CI)
sys_time 1.258 ± 0.031 1.248 ± 0.023 [ -1.6% .. +0.1%]
user_time 3.659 ± 0.047 3.658 ± 0.041 [ -0.5% .. +0.4%]
wall_time 4.640 ± 0.085 4.625 ± 0.063 [ -1.0% .. +0.3%]
samples 49 61
There's also no stat sig change in RSS (as measured by `time -l`):
base diff difference (95% CI)
time 998038627.097 ± 13567305.958 1003327715.556 ± 15210451.236 [ -0.2% .. +1.2%]
samples 31 36
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D118797
2022-02-04 08:53:29 +08:00
|
|
|
for (const Section *section : objFile->sections) {
|
|
|
|
for (const Subsection &subsec : section->subsections) {
|
2021-11-05 11:55:31 +08:00
|
|
|
const InputSection *isec = subsec.isec;
|
2021-06-15 10:21:43 +08:00
|
|
|
if (!isCodeSection(isec))
|
|
|
|
continue;
|
|
|
|
if (cast<ConcatInputSection>(isec)->shouldOmitFromOutput())
|
|
|
|
continue;
|
[lld-macho][nfc] Eliminate InputSection::Shared
Earlier in LLD's evolution, I tried to create the illusion that
subsections were indistinguishable from "top-level" sections. Thus, even
though the subsections shared many common field values, I hid those
common values away in a private Shared struct (see D105305). More
recently, however, @gkm added a public `Section` struct in D113241 that
served as an explicit way to store values that are common to an entire
set of subsections (aka InputSections). Now that we have another "common
value" struct, `Shared` has been rendered redundant. All its fields can
be moved into `Section` instead, and the pointer to `Shared` can be replaced
with a pointer to `Section`.
This `Section` pointer also has the advantage of letting us inspect other
subsections easily, simplifying the implementation of {D118798}.
P.S. I do think that having both `Section` and `InputSection` makes for
a slightly confusing naming scheme. I considered renaming `InputSection`
to `Subsection`, but that would break the symmetry with `OutputSection`.
It would also make us deviate from LLD-ELF's naming scheme.
This change is perf-neutral on my 3.2 GHz 16-Core Intel Xeon W machine:
base diff difference (95% CI)
sys_time 1.258 ± 0.031 1.248 ± 0.023 [ -1.6% .. +0.1%]
user_time 3.659 ± 0.047 3.658 ± 0.041 [ -0.5% .. +0.4%]
wall_time 4.640 ± 0.085 4.625 ± 0.063 [ -1.0% .. +0.3%]
samples 49 61
There's also no stat sig change in RSS (as measured by `time -l`):
base diff difference (95% CI)
time 998038627.097 ± 13567305.958 1003327715.556 ± 15210451.236 [ -0.2% .. +1.2%]
samples 31 36
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D118797
2022-02-04 08:53:29 +08:00
|
|
|
const uint64_t beginAddr = section->addr + subsec.offset;
|
2021-06-15 10:21:43 +08:00
|
|
|
auto it = llvm::lower_bound(
|
|
|
|
entries, beginAddr,
|
|
|
|
[](const MachO::data_in_code_entry &entry, uint64_t addr) {
|
|
|
|
return entry.offset < addr;
|
|
|
|
});
|
2021-09-11 05:00:43 +08:00
|
|
|
const uint64_t endAddr = beginAddr + isec->getSize();
|
2021-06-15 10:21:43 +08:00
|
|
|
for (const auto end = entries.end();
|
|
|
|
it != end && it->offset + it->length <= endAddr; ++it)
|
|
|
|
dataInCodeEntries.push_back(
|
|
|
|
{static_cast<uint32_t>(isec->getVA(it->offset - beginAddr) -
|
|
|
|
in.header->addr),
|
|
|
|
it->length, it->kind});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return dataInCodeEntries;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Populates `entries`, choosing the LP64 or ILP32 instantiation of the
// collector according to the target's word size.
void DataInCodeSection::finalizeContents() {
  if (target->wordSize == 8)
    entries = collectDataInCodeEntries<LP64>();
  else
    entries = collectDataInCodeEntries<ILP32>();
}
|
|
|
|
|
|
|
|
void DataInCodeSection::writeTo(uint8_t *buf) const {
|
2021-06-15 12:15:54 +08:00
|
|
|
if (!entries.empty())
|
|
|
|
memcpy(buf, entries.data(), getRawSize());
|
2021-06-15 10:21:43 +08:00
|
|
|
}
|
|
|
|
|
2021-03-09 14:00:37 +08:00
|
|
|
// Constructs the function-starts section by forwarding the __LINKEDIT segment
// name and the function-starts section name to the LinkEditSection base.
FunctionStartsSection::FunctionStartsSection()
    : LinkEditSection(segment_names::linkEdit, section_names::functionStarts) {
}
|
2021-03-09 14:00:37 +08:00
|
|
|
|
|
|
|
void FunctionStartsSection::finalizeContents() {
|
|
|
|
raw_svector_ostream os{contents};
|
2021-06-12 08:47:28 +08:00
|
|
|
std::vector<uint64_t> addrs;
|
2021-11-19 23:56:58 +08:00
|
|
|
for (const InputFile *file : inputFiles) {
|
|
|
|
if (auto *objFile = dyn_cast<ObjFile>(file)) {
|
|
|
|
for (const Symbol *sym : objFile->symbols) {
|
|
|
|
if (const auto *defined = dyn_cast_or_null<Defined>(sym)) {
|
|
|
|
if (!defined->isec || !isCodeSection(defined->isec) ||
|
|
|
|
!defined->isLive())
|
|
|
|
continue;
|
|
|
|
// TODO: Add support for thumbs, in that case
|
|
|
|
// the lowest bit of nextAddr needs to be set to 1.
|
|
|
|
addrs.push_back(defined->getVA());
|
|
|
|
}
|
|
|
|
}
|
2021-03-09 14:00:37 +08:00
|
|
|
}
|
|
|
|
}
|
2021-06-12 08:47:28 +08:00
|
|
|
llvm::sort(addrs);
|
|
|
|
uint64_t addr = in.header->addr;
|
|
|
|
for (uint64_t nextAddr : addrs) {
|
|
|
|
uint64_t delta = nextAddr - addr;
|
|
|
|
if (delta == 0)
|
|
|
|
continue;
|
|
|
|
encodeULEB128(delta, os);
|
|
|
|
addr = nextAddr;
|
|
|
|
}
|
2021-03-09 14:00:37 +08:00
|
|
|
os << '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copies the bytes accumulated in `contents` into `buf`.
void FunctionStartsSection::writeTo(uint8_t *buf) const {
  const auto byteCount = contents.size();
  memcpy(buf, contents.data(), byteCount);
}
|
|
|
|
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto &sect : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
// Constructs the symbol table section in __LINKEDIT and keeps a reference to
// the string table section that the emit* helpers add strings to.
SymtabSection::SymtabSection(StringTableSection &stringTableSection)
    : LinkEditSection(segment_names::linkEdit, section_names::symbolTable),
      stringTableSection(stringTableSection) {
}
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto &sect : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
|
2022-06-21 07:15:57 +08:00
|
|
|
void SymtabSection::emitBeginSourceStab(DWARFUnit *compileUnit) {
|
2021-03-12 02:28:08 +08:00
|
|
|
StabsEntry stab(N_SO);
|
2022-06-21 07:15:57 +08:00
|
|
|
SmallString<261> dir(compileUnit->getCompilationDir());
|
|
|
|
StringRef sep = sys::path::get_separator();
|
|
|
|
// We don't use `path::append` here because we want an empty `dir` to result
|
|
|
|
// in an absolute path. `append` would give us a relative path for that case.
|
|
|
|
if (!dir.endswith(sep))
|
|
|
|
dir += sep;
|
|
|
|
stab.strx = stringTableSection.addString(
|
|
|
|
saver().save(dir + compileUnit->getUnitDIE().getShortName()));
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
stabs.emplace_back(std::move(stab));
|
|
|
|
}
|
|
|
|
|
|
|
|
void SymtabSection::emitEndSourceStab() {
|
2021-03-12 02:28:08 +08:00
|
|
|
StabsEntry stab(N_SO);
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
stab.sect = 1;
|
|
|
|
stabs.emplace_back(std::move(stab));
|
|
|
|
}
|
|
|
|
|
|
|
|
void SymtabSection::emitObjectFileStab(ObjFile *file) {
|
2021-03-12 02:28:08 +08:00
|
|
|
StabsEntry stab(N_OSO);
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
stab.sect = target->cpuSubtype;
|
2020-12-02 06:45:11 +08:00
|
|
|
SmallString<261> path(!file->archiveName.empty() ? file->archiveName
|
|
|
|
: file->getName());
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
std::error_code ec = sys::fs::make_absolute(path);
|
|
|
|
if (ec)
|
2020-12-02 06:45:11 +08:00
|
|
|
fatal("failed to get absolute path for " + path);
|
|
|
|
|
|
|
|
if (!file->archiveName.empty())
|
|
|
|
path.append({"(", file->getName(), ")"});
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
|
2022-01-21 03:53:18 +08:00
|
|
|
StringRef adjustedPath = saver().save(path.str());
|
2021-10-22 10:38:12 +08:00
|
|
|
adjustedPath.consume_front(config->osoPrefix);
|
|
|
|
|
|
|
|
stab.strx = stringTableSection.addString(adjustedPath);
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
stab.desc = 1;
|
2020-12-02 06:45:11 +08:00
|
|
|
stab.value = file->modTime;
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
stabs.emplace_back(std::move(stab));
|
|
|
|
}
|
|
|
|
|
2020-12-02 06:45:12 +08:00
|
|
|
// Appends an N_FUN stab whose value is the size of `defined`.
void SymtabSection::emitEndFunStab(Defined *defined) {
  StabsEntry endFun(N_FUN);
  endFun.value = defined->size;
  stabs.emplace_back(std::move(endFun));
}
|
|
|
|
|
|
|
|
void SymtabSection::emitStabs() {
|
2021-10-27 12:42:25 +08:00
|
|
|
if (config->omitDebugInfo)
|
|
|
|
return;
|
|
|
|
|
2021-04-09 02:12:20 +08:00
|
|
|
for (const std::string &s : config->astPaths) {
|
|
|
|
StabsEntry astStab(N_AST);
|
|
|
|
astStab.strx = stringTableSection.addString(s);
|
|
|
|
stabs.emplace_back(std::move(astStab));
|
|
|
|
}
|
|
|
|
|
2022-06-02 02:49:19 +08:00
|
|
|
// Cache the file ID for each symbol in an std::pair for faster sorting.
|
|
|
|
using SortingPair = std::pair<Defined *, int>;
|
|
|
|
std::vector<SortingPair> symbolsNeedingStabs;
|
2020-12-02 06:45:12 +08:00
|
|
|
for (const SymtabEntry &entry :
|
|
|
|
concat<SymtabEntry>(localSymbols, externalSymbols)) {
|
|
|
|
Symbol *sym = entry.sym;
|
[lld/mac] Implement -dead_strip
Also adds support for live_support sections, no_dead_strip sections,
.no_dead_strip symbols.
Chromium Framework 345MB unstripped -> 250MB stripped
(vs 290MB unstripped -> 236M stripped with ld64).
Doing dead stripping is a bit faster than not, because so much less
data needs to be processed:
% ministat lld_*
x lld_nostrip.txt
+ lld_strip.txt
N Min Max Median Avg Stddev
x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794
+ 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651
Difference at 95.0% confidence
-0.144711 +/- 0.0336749
-3.60967% +/- 0.839989%
(Student's t, pooled s = 0.0358398)
This interacts with many parts of the linker. I tried to add test coverage
for all added `isLive()` checks, so that some test will fail if any of them
is removed. I checked that the test expectations for the most part match
ld64's behavior (except for live-support-iterations.s, see the comment
in the test). Interacts with:
- debug info
- export tries
- import opcodes
- flags like -exported_symbol(s_list)
- -U / dynamic_lookup
- mod_init_funcs, mod_term_funcs
- weak symbol handling
- unwind info
- stubs
- map files
- -sectcreate
- undefined, dylib, common, defined (both absolute and normal) symbols
It's possible it interacts with more features I didn't think of,
of course.
I also did some manual testing:
- check-llvm check-clang check-lld work with lld with this patch
as host linker and -dead_strip enabled
- Chromium still starts
- Chromium's base_unittests still pass, including unwind tests
Implemenation-wise, this is InputSection-based, so it'll work for
object files with .subsections_via_symbols (which includes all
object files generated by clang). I first based this on the COFF
implementation, but later realized that things are more similar to ELF.
I think it'd be good to refactor MarkLive.cpp to look more like the ELF
part at some point, but I'd like to get a working state checked in first.
Mechanical parts:
- Rename canOmitFromOutput to wasCoalesced (no behavior change)
since it really is for weak coalesced symbols
- Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP
(`.no_dead_strip` in asm)
Fixes PR49276.
Differential Revision: https://reviews.llvm.org/D103324
2021-05-08 05:10:05 +08:00
|
|
|
assert(sym->isLive() &&
|
|
|
|
"dead symbols should not be in localSymbols, externalSymbols");
|
2020-12-02 06:45:12 +08:00
|
|
|
if (auto *defined = dyn_cast<Defined>(sym)) {
|
2022-04-12 03:45:25 +08:00
|
|
|
// Excluded symbols should have been filtered out in finalizeContents().
|
|
|
|
assert(defined->includeInSymtab);
|
|
|
|
|
2020-12-02 06:45:12 +08:00
|
|
|
if (defined->isAbsolute())
|
|
|
|
continue;
|
2022-04-07 03:55:42 +08:00
|
|
|
|
|
|
|
// Constant-folded symbols go in the executable's symbol table, but don't
|
|
|
|
// get a stabs entry.
|
|
|
|
if (defined->wasIdenticalCodeFolded)
|
|
|
|
continue;
|
|
|
|
|
2020-12-02 06:45:12 +08:00
|
|
|
InputSection *isec = defined->isec;
|
2021-07-02 08:33:55 +08:00
|
|
|
ObjFile *file = dyn_cast_or_null<ObjFile>(isec->getFile());
|
2020-12-02 06:45:12 +08:00
|
|
|
if (!file || !file->compileUnit)
|
|
|
|
continue;
|
2022-04-07 20:48:52 +08:00
|
|
|
|
2022-06-02 02:49:19 +08:00
|
|
|
symbolsNeedingStabs.emplace_back(defined, defined->isec->getFile()->id);
|
2020-12-02 06:45:12 +08:00
|
|
|
}
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
}
|
|
|
|
|
2022-06-02 02:53:08 +08:00
|
|
|
llvm::stable_sort(symbolsNeedingStabs,
|
|
|
|
[&](const SortingPair &a, const SortingPair &b) {
|
|
|
|
return a.second < b.second;
|
|
|
|
});
|
2020-12-02 06:45:12 +08:00
|
|
|
|
|
|
|
// Emit STABS symbols so that dsymutil and/or the debugger can map address
|
|
|
|
// regions in the final binary to the source and object files from which they
|
|
|
|
// originated.
|
|
|
|
InputFile *lastFile = nullptr;
|
2022-06-02 02:49:19 +08:00
|
|
|
for (SortingPair &pair : symbolsNeedingStabs) {
|
|
|
|
Defined *defined = pair.first;
|
2020-12-02 06:45:12 +08:00
|
|
|
InputSection *isec = defined->isec;
|
2021-07-02 08:33:55 +08:00
|
|
|
ObjFile *file = cast<ObjFile>(isec->getFile());
|
2020-12-02 06:45:12 +08:00
|
|
|
|
|
|
|
if (lastFile == nullptr || lastFile != file) {
|
|
|
|
if (lastFile != nullptr)
|
|
|
|
emitEndSourceStab();
|
|
|
|
lastFile = file;
|
|
|
|
|
2022-06-21 07:15:57 +08:00
|
|
|
emitBeginSourceStab(file->compileUnit);
|
2020-12-02 06:45:12 +08:00
|
|
|
emitObjectFileStab(file);
|
|
|
|
}
|
|
|
|
|
|
|
|
StabsEntry symStab;
|
2021-10-29 23:00:13 +08:00
|
|
|
symStab.sect = defined->isec->parent->index;
|
2020-12-02 06:45:12 +08:00
|
|
|
symStab.strx = stringTableSection.addString(defined->getName());
|
|
|
|
symStab.value = defined->getVA();
|
|
|
|
|
2020-12-02 06:45:13 +08:00
|
|
|
if (isCodeSection(isec)) {
|
2021-03-12 02:28:08 +08:00
|
|
|
symStab.type = N_FUN;
|
2020-12-02 06:45:12 +08:00
|
|
|
stabs.emplace_back(std::move(symStab));
|
|
|
|
emitEndFunStab(defined);
|
|
|
|
} else {
|
2021-03-12 02:28:08 +08:00
|
|
|
symStab.type = defined->isExternal() ? N_GSYM : N_STSYM;
|
2020-12-02 06:45:12 +08:00
|
|
|
stabs.emplace_back(std::move(symStab));
|
|
|
|
}
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
}
|
2020-12-02 06:45:12 +08:00
|
|
|
|
|
|
|
if (!stabs.empty())
|
|
|
|
emitEndSourceStab();
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto § : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void SymtabSection::finalizeContents() {
|
2020-12-23 00:00:57 +08:00
|
|
|
auto addSymbol = [&](std::vector<SymtabEntry> &symbols, Symbol *sym) {
|
|
|
|
uint32_t strx = stringTableSection.addString(sym->getName());
|
|
|
|
symbols.push_back({sym, strx});
|
|
|
|
};
|
|
|
|
|
2022-05-20 15:45:26 +08:00
|
|
|
std::function<void(Symbol *)> localSymbolsHandler;
|
|
|
|
switch (config->localSymbolsPresence) {
|
|
|
|
case SymtabPresence::All:
|
|
|
|
localSymbolsHandler = [&](Symbol *sym) { addSymbol(localSymbols, sym); };
|
|
|
|
break;
|
|
|
|
case SymtabPresence::None:
|
|
|
|
localSymbolsHandler = [&](Symbol *) { /* Do nothing*/ };
|
|
|
|
break;
|
|
|
|
case SymtabPresence::SelectivelyIncluded:
|
|
|
|
localSymbolsHandler = [&](Symbol *sym) {
|
|
|
|
if (config->localSymbolPatterns.match(sym->getName()))
|
|
|
|
addSymbol(localSymbols, sym);
|
|
|
|
};
|
|
|
|
break;
|
|
|
|
case SymtabPresence::SelectivelyExcluded:
|
|
|
|
localSymbolsHandler = [&](Symbol *sym) {
|
|
|
|
if (!config->localSymbolPatterns.match(sym->getName()))
|
|
|
|
addSymbol(localSymbols, sym);
|
|
|
|
};
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-12-02 06:45:09 +08:00
|
|
|
// Local symbols aren't in the SymbolTable, so we walk the list of object
|
|
|
|
// files to gather them.
|
2022-06-02 00:54:41 +08:00
|
|
|
// But if `-x` is set, then we don't need to. localSymbolsHandler() will do
|
|
|
|
// the right thing regardless, but this check is a perf optimization because
|
|
|
|
// iterating through all the input files and their symbols is expensive.
|
2022-05-20 15:45:26 +08:00
|
|
|
if (config->localSymbolsPresence != SymtabPresence::None) {
|
|
|
|
for (const InputFile *file : inputFiles) {
|
|
|
|
if (auto *objFile = dyn_cast<ObjFile>(file)) {
|
|
|
|
for (Symbol *sym : objFile->symbols) {
|
|
|
|
if (auto *defined = dyn_cast_or_null<Defined>(sym)) {
|
|
|
|
if (defined->isExternal() || !defined->isLive() ||
|
|
|
|
!defined->includeInSymtab)
|
|
|
|
continue;
|
|
|
|
localSymbolsHandler(sym);
|
|
|
|
}
|
2020-12-02 06:45:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external), so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor to correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
// __dyld_private is a local symbol too. It's linker-created and doesn't
|
|
|
|
// exist in any object file.
|
2021-04-07 02:05:15 +08:00
|
|
|
if (Defined *dyldPrivate = in.stubHelper->dyldPrivate)
|
2022-05-20 15:45:26 +08:00
|
|
|
localSymbolsHandler(dyldPrivate);
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external), so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor to correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
|
2020-09-05 09:02:07 +08:00
|
|
|
for (Symbol *sym : symtab->getSymbols()) {
|
[lld/mac] Implement -dead_strip
Also adds support for live_support sections, no_dead_strip sections,
.no_dead_strip symbols.
Chromium Framework 345MB unstripped -> 250MB stripped
(vs 290MB unstripped -> 236M stripped with ld64).
Doing dead stripping is a bit faster than not, because so much less
data needs to be processed:
% ministat lld_*
x lld_nostrip.txt
+ lld_strip.txt
N Min Max Median Avg Stddev
x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794
+ 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651
Difference at 95.0% confidence
-0.144711 +/- 0.0336749
-3.60967% +/- 0.839989%
(Student's t, pooled s = 0.0358398)
This interacts with many parts of the linker. I tried to add test coverage
for all added `isLive()` checks, so that some test will fail if any of them
is removed. I checked that the test expectations for the most part match
ld64's behavior (except for live-support-iterations.s, see the comment
in the test). Interacts with:
- debug info
- export tries
- import opcodes
- flags like -exported_symbol(s_list)
- -U / dynamic_lookup
- mod_init_funcs, mod_term_funcs
- weak symbol handling
- unwind info
- stubs
- map files
- -sectcreate
- undefined, dylib, common, defined (both absolute and normal) symbols
It's possible it interacts with more features I didn't think of,
of course.
I also did some manual testing:
- check-llvm check-clang check-lld work with lld with this patch
as host linker and -dead_strip enabled
- Chromium still starts
- Chromium's base_unittests still pass, including unwind tests
Implemenation-wise, this is InputSection-based, so it'll work for
object files with .subsections_via_symbols (which includes all
object files generated by clang). I first based this on the COFF
implementation, but later realized that things are more similar to ELF.
I think it'd be good to refactor MarkLive.cpp to look more like the ELF
part at some point, but I'd like to get a working state checked in first.
Mechanical parts:
- Rename canOmitFromOutput to wasCoalesced (no behavior change)
since it really is for weak coalesced symbols
- Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP
(`.no_dead_strip` in asm)
Fixes PR49276.
Differential Revision: https://reviews.llvm.org/D103324
2021-05-08 05:10:05 +08:00
|
|
|
if (!sym->isLive())
|
|
|
|
continue;
|
2020-12-02 06:45:09 +08:00
|
|
|
if (auto *defined = dyn_cast<Defined>(sym)) {
|
2021-03-19 06:49:45 +08:00
|
|
|
if (!defined->includeInSymtab)
|
2021-03-13 06:26:12 +08:00
|
|
|
continue;
|
2020-12-02 06:45:09 +08:00
|
|
|
assert(defined->isExternal());
|
2021-05-18 01:49:17 +08:00
|
|
|
if (defined->privateExtern)
|
2022-05-20 15:45:26 +08:00
|
|
|
localSymbolsHandler(defined);
|
2021-05-18 01:49:17 +08:00
|
|
|
else
|
|
|
|
addSymbol(externalSymbols, defined);
|
2020-12-16 10:05:06 +08:00
|
|
|
} else if (auto *dysym = dyn_cast<DylibSymbol>(sym)) {
|
|
|
|
if (dysym->isReferenced())
|
2020-12-23 00:00:57 +08:00
|
|
|
addSymbol(undefinedSymbols, sym);
|
2020-09-05 09:02:07 +08:00
|
|
|
}
|
2020-12-02 06:45:09 +08:00
|
|
|
}
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
|
2020-12-02 06:45:12 +08:00
|
|
|
emitStabs();
|
2020-12-02 06:45:09 +08:00
|
|
|
uint32_t symtabIndex = stabs.size();
|
|
|
|
for (const SymtabEntry &entry :
|
|
|
|
concat<SymtabEntry>(localSymbols, externalSymbols, undefinedSymbols)) {
|
|
|
|
entry.sym->symtabIndex = symtabIndex++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t SymtabSection::getNumSymbols() const {
|
|
|
|
return stabs.size() + localSymbols.size() + externalSymbols.size() +
|
|
|
|
undefinedSymbols.size();
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto § : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-03 06:46:18 +08:00
|
|
|
// This serves to hide (type-erase) the template parameter from SymtabSection.
|
2021-06-14 07:43:37 +08:00
|
|
|
template <class LP> class SymtabSectionImpl final : public SymtabSection {
|
2021-04-03 06:46:18 +08:00
|
|
|
public:
|
|
|
|
SymtabSectionImpl(StringTableSection &stringTableSection)
|
|
|
|
: SymtabSection(stringTableSection) {}
|
|
|
|
uint64_t getRawSize() const override;
|
|
|
|
void writeTo(uint8_t *buf) const override;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Returns the size in bytes of the symbol table: one `LP::nlist` record per
// entry counted by getNumSymbols().
template <class LP> uint64_t SymtabSectionImpl<LP>::getRawSize() const {
  using NList = typename LP::nlist;
  return sizeof(NList) * getNumSymbols();
}
|
|
|
|
|
|
|
|
template <class LP> void SymtabSectionImpl<LP>::writeTo(uint8_t *buf) const {
|
|
|
|
auto *nList = reinterpret_cast<typename LP::nlist *>(buf);
|
2020-12-02 06:45:09 +08:00
|
|
|
// Emit the stabs entries before the "real" symbols. We cannot emit them
|
|
|
|
// after as that would render Symbol::symtabIndex inaccurate.
|
|
|
|
for (const StabsEntry &entry : stabs) {
|
|
|
|
nList->n_strx = entry.strx;
|
|
|
|
nList->n_type = entry.type;
|
|
|
|
nList->n_sect = entry.sect;
|
|
|
|
nList->n_desc = entry.desc;
|
|
|
|
nList->n_value = entry.value;
|
|
|
|
++nList;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const SymtabEntry &entry : concat<const SymtabEntry>(
|
|
|
|
localSymbols, externalSymbols, undefinedSymbols)) {
|
2020-04-29 07:58:22 +08:00
|
|
|
nList->n_strx = entry.strx;
|
2020-09-01 11:32:39 +08:00
|
|
|
// TODO populate n_desc with more flags
|
2020-05-12 06:50:22 +08:00
|
|
|
if (auto *defined = dyn_cast<Defined>(entry.sym)) {
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external", so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
uint8_t scope = 0;
|
2021-05-18 08:53:55 +08:00
|
|
|
if (defined->privateExtern) {
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external", so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
// Private external -- dylib scoped symbol.
|
|
|
|
// Promote to non-external at link time.
|
2021-03-12 02:28:08 +08:00
|
|
|
scope = N_PEXT;
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external", so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor to correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
} else if (defined->isExternal()) {
|
|
|
|
// Normal global symbol.
|
2021-03-12 02:28:08 +08:00
|
|
|
scope = N_EXT;
|
[lld/mac] Implement support for private extern symbols
Private extern symbols are used for things scoped to the linkage unit.
They cause duplicate symbol errors (so they're in the symbol table,
unlike TU-scoped truly local symbols), but they don't make it into the
export trie. They are created e.g. by compiling with
-fvisibility=hidden.
If two weak symbols have differing privateness, the combined symbol is
non-private external. (Example: inline functions and some TUs that
include the header defining it were built with
-fvisibility-inlines-hidden and some weren't).
A weak private external symbol implicitly has its "weak" dropped and
behaves like a regular strong private external symbol: Weak is an export
trie concept, and private symbols are not in the export trie.
If a weak and a strong symbol have different privateness, the strong
symbol wins.
If two common symbols have differing privateness, the larger symbol
wins. If they have the same size, the privateness of the symbol seen
later during the link wins (!) -- this is a bit lame, but it matches
ld64 and this behavior takes 2 lines less to implement than the less
surprising "result is non-private external", so match ld64.
(Example: `int a` in two .c files, both built with -fcommon,
one built with -fvisibility=hidden and one without.)
This also makes `__dyld_private` a true TU-local symbol, matching ld64.
To make this work, make the `const char*` StringRefZ ctor to correctly
set `size` (without this, writing the string table crashed when calling
getName() on the __dyld_private symbol).
Mention in CommonSymbol's comment that common symbols are now disabled
by default in clang.
Mention in -keep_private_externs's HelpText that the flag only has an
effect with `-r` (which we don't implement yet -- so this patch here
doesn't regress any behavior around -r + -keep_private_externs)). ld64
doesn't explicitly document it, but the commit text of
http://reviews.llvm.org/rL216146 does, and ld64's
OutputFile::buildSymbolTable() checks `_options.outputKind() ==
Options::kObjectFile` before calling `_options.keepPrivateExterns()`
(the only reference to that function).
Fixes PR48536.
Differential Revision: https://reviews.llvm.org/D93609
2020-12-18 02:30:18 +08:00
|
|
|
} else {
|
|
|
|
// TU-local symbol from localSymbols.
|
|
|
|
scope = 0;
|
|
|
|
}
|
|
|
|
|
2020-09-18 23:40:46 +08:00
|
|
|
if (defined->isAbsolute()) {
|
2021-03-12 02:28:08 +08:00
|
|
|
nList->n_type = scope | N_ABS;
|
|
|
|
nList->n_sect = NO_SECT;
|
2020-09-18 23:40:46 +08:00
|
|
|
nList->n_value = defined->value;
|
|
|
|
} else {
|
2021-03-12 02:28:08 +08:00
|
|
|
nList->n_type = scope | N_SECT;
|
2021-10-29 23:00:13 +08:00
|
|
|
nList->n_sect = defined->isec->parent->index;
|
2020-09-18 23:40:46 +08:00
|
|
|
// For the N_SECT symbol type, n_value is the address of the symbol
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sections as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
nList->n_value = defined->getVA();
|
2020-09-18 23:40:46 +08:00
|
|
|
}
|
2021-05-01 04:17:26 +08:00
|
|
|
nList->n_desc |= defined->thumb ? N_ARM_THUMB_DEF : 0;
|
2021-03-12 02:28:08 +08:00
|
|
|
nList->n_desc |= defined->isExternalWeakDef() ? N_WEAK_DEF : 0;
|
2021-05-17 21:15:39 +08:00
|
|
|
nList->n_desc |=
|
|
|
|
defined->referencedDynamically ? REFERENCED_DYNAMICALLY : 0;
|
2020-12-16 02:36:15 +08:00
|
|
|
} else if (auto *dysym = dyn_cast<DylibSymbol>(entry.sym)) {
|
|
|
|
uint16_t n_desc = nList->n_desc;
|
2021-03-02 04:25:10 +08:00
|
|
|
int16_t ordinal = ordinalForDylibSymbol(*dysym);
|
2021-03-12 02:28:08 +08:00
|
|
|
if (ordinal == BIND_SPECIAL_DYLIB_FLAT_LOOKUP)
|
|
|
|
SET_LIBRARY_ORDINAL(n_desc, DYNAMIC_LOOKUP_ORDINAL);
|
|
|
|
else if (ordinal == BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE)
|
|
|
|
SET_LIBRARY_ORDINAL(n_desc, EXECUTABLE_ORDINAL);
|
2021-03-02 04:25:10 +08:00
|
|
|
else {
|
|
|
|
assert(ordinal > 0);
|
2021-03-12 02:28:08 +08:00
|
|
|
SET_LIBRARY_ORDINAL(n_desc, static_cast<uint8_t>(ordinal));
|
2021-03-02 04:25:10 +08:00
|
|
|
}
|
2021-02-23 02:03:02 +08:00
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
nList->n_type = N_EXT;
|
|
|
|
n_desc |= dysym->isWeakDef() ? N_WEAK_DEF : 0;
|
|
|
|
n_desc |= dysym->isWeakRef() ? N_WEAK_REF : 0;
|
2020-12-16 02:36:15 +08:00
|
|
|
nList->n_desc = n_desc;
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto &sect : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
}
|
|
|
|
++nList;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-03 06:46:18 +08:00
|
|
|
template <class LP>
|
|
|
|
SymtabSection *
|
|
|
|
macho::makeSymtabSection(StringTableSection &stringTableSection) {
|
|
|
|
return make<SymtabSectionImpl<LP>>(stringTableSection);
|
|
|
|
}
|
|
|
|
|
2020-09-05 09:02:07 +08:00
|
|
|
/// Registers the indirect symbol table as a link-edit section, using the
/// link-edit segment name and the indirect-symbol-table section name.
IndirectSymtabSection::IndirectSymtabSection()
    : LinkEditSection(
          segment_names::linkEdit, section_names::indirectSymbolTable) {}
|
|
|
|
|
|
|
|
uint32_t IndirectSymtabSection::getNumSymbols() const {
|
|
|
|
return in.got->getEntries().size() + in.tlvPointers->getEntries().size() +
|
2021-06-11 08:17:54 +08:00
|
|
|
2 * in.stubs->getEntries().size();
|
2020-09-05 09:02:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool IndirectSymtabSection::isNeeded() const {
|
|
|
|
return in.got->isNeeded() || in.tlvPointers->isNeeded() ||
|
|
|
|
in.stubs->isNeeded();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Records, in each indexed section's `reserved1` field, the index of that
/// section's first entry within the indirect symbol table.
///
/// Ranges are laid out back to back in the order GOT, TLV pointers, stubs,
/// lazy pointers. The lazy-pointer range starts right after the stub range
/// and is the last one, so nothing needs to be accumulated after it.
void IndirectSymtabSection::finalizeContents() {
  uint32_t nextIndex = 0;

  in.got->reserved1 = nextIndex;
  nextIndex += in.got->getEntries().size();

  in.tlvPointers->reserved1 = nextIndex;
  nextIndex += in.tlvPointers->getEntries().size();

  in.stubs->reserved1 = nextIndex;
  nextIndex += in.stubs->getEntries().size();

  in.lazyPointers->reserved1 = nextIndex;
}
|
|
|
|
|
2021-02-09 02:47:33 +08:00
|
|
|
static uint32_t indirectValue(const Symbol *sym) {
|
[lld/mac] Mark private externs with GOT relocs as LOCAL in indirect symbtab
prepareSymbolRelocation() in Writer.cpp adds both symbols that need binding and
symbols relocated with a pointer relocation to the got.
Pointer relocations are emitted for non-movq GOTPCREL(%rip) loads. (movqs
become GOT_LOADs so that the linker knows they can be relaxed to leaqs, while
others, such as addq, become just GOT -- a pointer relocation -- since they
can't be relaxed in that way).
For example, this C file produces a private_extern GOT relocation when
compiled with -O2 with clang:
extern const char kString[];
const char* g(int a) { return kString + a; }
Linkers need to put pointer-relocated symbols into the GOT, but ld64 marks them
as LOCAL in the indirect symbol table. This matters, since `strip -x` looks at
the indirect symbol table when deciding what to strip.
The indirect symtab emitting code was assuming that only symbols that need
binding are in the GOT, but pointer relocations where there too. Hence, the
code needs to explicitly check if a symbol is a private extern.
Fixes https://crbug.com/1242638, which has some more information in comments 14
and 15. With this patch, the output of `nm -U` on Chromium Framework after
stripping now contains just two symbols when using lld, just like with ld64.
Differential Revision: https://reviews.llvm.org/D111852
2021-10-15 06:32:10 +08:00
|
|
|
if (sym->symtabIndex == UINT32_MAX)
|
|
|
|
return INDIRECT_SYMBOL_LOCAL;
|
|
|
|
if (auto *defined = dyn_cast<Defined>(sym))
|
|
|
|
if (defined->privateExtern)
|
|
|
|
return INDIRECT_SYMBOL_LOCAL;
|
|
|
|
return sym->symtabIndex;
|
2021-02-09 02:47:33 +08:00
|
|
|
}
|
|
|
|
|
2020-09-05 09:02:07 +08:00
|
|
|
void IndirectSymtabSection::writeTo(uint8_t *buf) const {
|
|
|
|
uint32_t off = 0;
|
|
|
|
for (const Symbol *sym : in.got->getEntries()) {
|
2021-02-09 02:47:33 +08:00
|
|
|
write32le(buf + off * sizeof(uint32_t), indirectValue(sym));
|
2020-09-05 09:02:07 +08:00
|
|
|
++off;
|
|
|
|
}
|
|
|
|
for (const Symbol *sym : in.tlvPointers->getEntries()) {
|
2021-02-09 02:47:33 +08:00
|
|
|
write32le(buf + off * sizeof(uint32_t), indirectValue(sym));
|
2020-09-05 09:02:07 +08:00
|
|
|
++off;
|
|
|
|
}
|
|
|
|
for (const Symbol *sym : in.stubs->getEntries()) {
|
2021-02-09 02:47:33 +08:00
|
|
|
write32le(buf + off * sizeof(uint32_t), indirectValue(sym));
|
2020-09-05 09:02:07 +08:00
|
|
|
++off;
|
|
|
|
}
|
2021-06-11 08:17:54 +08:00
|
|
|
// There is a 1:1 correspondence between stubs and LazyPointerSection
|
|
|
|
// entries. But giving __stubs and __la_symbol_ptr the same reserved1
|
|
|
|
// (the offset into the indirect symbol table) so that they both refer
|
|
|
|
// to the same range of offsets confuses `strip`, so write the stubs
|
|
|
|
// symbol table offsets a second time.
|
|
|
|
for (const Symbol *sym : in.stubs->getEntries()) {
|
|
|
|
write32le(buf + off * sizeof(uint32_t), indirectValue(sym));
|
|
|
|
++off;
|
|
|
|
}
|
2020-09-05 09:02:07 +08:00
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
/// Registers the string table as a link-edit section, using the link-edit
/// segment name and the string-table section name.
StringTableSection::StringTableSection()
    : LinkEditSection(
          segment_names::linkEdit, section_names::stringTable) {}
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto &sect : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
|
|
|
|
uint32_t StringTableSection::addString(StringRef str) {
|
|
|
|
uint32_t strx = size;
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
strings.push_back(str); // TODO: consider deduplicating strings
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto § : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
size += str.size() + 1; // account for null terminator
|
|
|
|
return strx;
|
|
|
|
}
|
|
|
|
|
2020-05-02 07:29:06 +08:00
|
|
|
void StringTableSection::writeTo(uint8_t *buf) const {
|
[lld-macho][reland] Add basic symbol table output
This diff implements basic support for writing a symbol table.
Attributes are loosely supported for extern symbols and not at all for
other types.
Initial version by Kellie Medlin <kelliem@fb.com>
Originally committed in a3d95a50ee33 and reverted in fbae153ca583 due to
UBSAN erroring over unaligned writes. That has been fixed in the
current diff with the following changes:
```
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -133,6 +133,9 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
: stringTableSection(stringTableSection) {
segname = segment_names::linkEdit;
name = section_names::symbolTable;
+ // TODO: When we introduce the SyntheticSections superclass, we should make
+ // all synthetic sections aligned to WordSize by default.
+ align = WordSize;
}
size_t SymtabSection::getSize() const {
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -371,6 +371,7 @@ void Writer::assignAddresses(OutputSegment *seg) {
ArrayRef<InputSection *> sections = p.second;
for (InputSection *isec : sections) {
addr = alignTo(addr, isec->align);
+ // We must align the file offsets too to avoid misaligned writes of
+ // structs.
+ fileOff = alignTo(fileOff, isec->align);
isec->addr = addr;
addr += isec->getSize();
fileOff += isec->getFileSize();
@@ -396,6 +397,7 @@ void Writer::writeSections() {
uint64_t fileOff = seg->fileOff;
for (auto § : seg->getSections()) {
for (InputSection *isec : sect.second) {
+ fileOff = alignTo(fileOff, isec->align);
isec->writeTo(buf + fileOff);
fileOff += isec->getFileSize();
}
```
I don't think it's easy to write a test for alignment (that doesn't
involve brittly hard-coding file offsets), so there isn't one... but
UBSAN builds pass now.
Differential Revision: https://reviews.llvm.org/D79050
2020-04-29 07:58:19 +08:00
|
|
|
uint32_t off = 0;
|
|
|
|
for (StringRef str : strings) {
|
|
|
|
memcpy(buf + off, str.data(), str.size());
|
|
|
|
off += str.size() + 1; // account for null terminator
|
|
|
|
}
|
|
|
|
}
|
2021-01-07 10:11:44 +08:00
|
|
|
|
2021-10-02 05:30:21 +08:00
|
|
|
// Compile-time layout checks: both header sizes must be multiples of 8.
// The messages make a failing build point directly at the violated invariant.
static_assert((CodeSignatureSection::blobHeadersSize % 8) == 0,
              "CodeSignatureSection::blobHeadersSize must be a multiple of 8");
static_assert((CodeSignatureSection::fixedHeadersSize % 8) == 0,
              "CodeSignatureSection::fixedHeadersSize must be a multiple of 8");
|
|
|
|
|
2021-01-07 10:11:44 +08:00
|
|
|
/// Sets up the code signature section and precomputes the header sizes that
/// depend on the output file's base name.
///
/// `fileName` becomes the part of `config->outputFile` after the last '/'
/// (or the whole path when there is no '/'). `allHeadersSize` is the fixed
/// header size plus the name and its null terminator, rounded up to 16 bytes;
/// `fileNamePad` is the number of bytes between the end of the name and the
/// end of the headers (terminator included).
CodeSignatureSection::CodeSignatureSection()
    : LinkEditSection(segment_names::linkEdit, section_names::codeSignature) {
  align = 16; // required by libstuff

  // FIXME: Consider using finalOutput instead of outputFile.
  fileName = config->outputFile;
  const size_t lastSlash = fileName.rfind('/');
  if (lastSlash != StringRef::npos)
    fileName = fileName.drop_front(lastSlash + 1);

  // NOTE: Any changes to these calculations should be repeated
  // in llvm-objcopy's MachOLayoutBuilder::layoutTail.
  const auto nameLength = fileName.size();
  allHeadersSize = alignTo<16>(fixedHeadersSize + nameLength + 1);
  fileNamePad = allHeadersSize - fixedHeadersSize - nameLength;
}
|
|
|
|
|
|
|
|
uint32_t CodeSignatureSection::getBlockCount() const {
|
|
|
|
return (fileOff + blockSize - 1) / blockSize;
|
2021-01-07 10:11:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t CodeSignatureSection::getRawSize() const {
|
2021-10-02 05:30:21 +08:00
|
|
|
return allHeadersSize + getBlockCount() * hashSize;
|
2021-01-07 10:11:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void CodeSignatureSection::writeHashes(uint8_t *buf) const {
|
2021-10-27 05:25:34 +08:00
|
|
|
// NOTE: Changes to this functionality should be repeated in llvm-objcopy's
|
|
|
|
// MachOWriter::writeSignatureData.
|
2022-06-22 03:12:02 +08:00
|
|
|
uint8_t *hashes = buf + fileOff + allHeadersSize;
|
|
|
|
for (uint64_t i = 0; i < getBlockCount(); ++i) {
|
|
|
|
sha256(buf + i * blockSize,
|
|
|
|
std::min(static_cast<size_t>(fileOff - i * blockSize), blockSize),
|
|
|
|
hashes + i * hashSize);
|
2021-10-02 05:30:21 +08:00
|
|
|
}
|
|
|
|
#if defined(__APPLE__)
|
|
|
|
// This is macOS-specific work-around and makes no sense for any
|
|
|
|
// other host OS. See https://openradar.appspot.com/FB8914231
|
|
|
|
//
|
|
|
|
// The macOS kernel maintains a signature-verification cache to
|
|
|
|
// quickly validate applications at time of execve(2). The trouble
|
|
|
|
// is that for the kernel creates the cache entry at the time of the
|
|
|
|
// mmap(2) call, before we have a chance to write either the code to
|
|
|
|
// sign or the signature header+hashes. The fix is to invalidate
|
|
|
|
// all cached data associated with the output file, thus discarding
|
|
|
|
// the bogus prematurely-cached signature.
|
|
|
|
msync(buf, fileOff + getSize(), MS_INVALIDATE);
|
|
|
|
#endif
|
2021-01-07 10:11:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void CodeSignatureSection::writeTo(uint8_t *buf) const {
|
2021-10-27 05:25:34 +08:00
|
|
|
// NOTE: Changes to this functionality should be repeated in llvm-objcopy's
|
|
|
|
// MachOWriter::writeSignatureData.
|
2021-10-02 05:30:21 +08:00
|
|
|
uint32_t signatureSize = static_cast<uint32_t>(getSize());
|
|
|
|
auto *superBlob = reinterpret_cast<CS_SuperBlob *>(buf);
|
|
|
|
write32be(&superBlob->magic, CSMAGIC_EMBEDDED_SIGNATURE);
|
|
|
|
write32be(&superBlob->length, signatureSize);
|
|
|
|
write32be(&superBlob->count, 1);
|
|
|
|
auto *blobIndex = reinterpret_cast<CS_BlobIndex *>(&superBlob[1]);
|
|
|
|
write32be(&blobIndex->type, CSSLOT_CODEDIRECTORY);
|
|
|
|
write32be(&blobIndex->offset, blobHeadersSize);
|
|
|
|
auto *codeDirectory =
|
|
|
|
reinterpret_cast<CS_CodeDirectory *>(buf + blobHeadersSize);
|
|
|
|
write32be(&codeDirectory->magic, CSMAGIC_CODEDIRECTORY);
|
|
|
|
write32be(&codeDirectory->length, signatureSize - blobHeadersSize);
|
|
|
|
write32be(&codeDirectory->version, CS_SUPPORTSEXECSEG);
|
|
|
|
write32be(&codeDirectory->flags, CS_ADHOC | CS_LINKER_SIGNED);
|
|
|
|
write32be(&codeDirectory->hashOffset,
|
|
|
|
sizeof(CS_CodeDirectory) + fileName.size() + fileNamePad);
|
|
|
|
write32be(&codeDirectory->identOffset, sizeof(CS_CodeDirectory));
|
|
|
|
codeDirectory->nSpecialSlots = 0;
|
|
|
|
write32be(&codeDirectory->nCodeSlots, getBlockCount());
|
|
|
|
write32be(&codeDirectory->codeLimit, fileOff);
|
|
|
|
codeDirectory->hashSize = static_cast<uint8_t>(hashSize);
|
|
|
|
codeDirectory->hashType = kSecCodeSignatureHashSHA256;
|
|
|
|
codeDirectory->platform = 0;
|
|
|
|
codeDirectory->pageSize = blockSizeShift;
|
|
|
|
codeDirectory->spare2 = 0;
|
|
|
|
codeDirectory->scatterOffset = 0;
|
|
|
|
codeDirectory->teamOffset = 0;
|
|
|
|
codeDirectory->spare3 = 0;
|
|
|
|
codeDirectory->codeLimit64 = 0;
|
2021-01-07 10:11:44 +08:00
|
|
|
OutputSegment *textSeg = getOrCreateOutputSegment(segment_names::text);
|
2021-10-02 05:30:21 +08:00
|
|
|
write64be(&codeDirectory->execSegBase, textSeg->fileOff);
|
|
|
|
write64be(&codeDirectory->execSegLimit, textSeg->fileSize);
|
|
|
|
write64be(&codeDirectory->execSegFlags,
|
|
|
|
config->outputType == MH_EXECUTE ? CS_EXECSEG_MAIN_BINARY : 0);
|
|
|
|
auto *id = reinterpret_cast<char *>(&codeDirectory[1]);
|
|
|
|
memcpy(id, fileName.begin(), fileName.size());
|
|
|
|
memset(id + fileName.size(), 0, fileNamePad);
|
2021-01-07 10:11:44 +08:00
|
|
|
}
|
2021-03-19 06:49:45 +08:00
|
|
|
|
2021-04-17 04:46:45 +08:00
|
|
|
/// Registers the bitcode bundle as a synthetic section, using the LLVM
/// segment name and the bitcode-bundle section name.
BitcodeBundleSection::BitcodeBundleSection()
    : SyntheticSection(
          segment_names::llvm, section_names::bitcodeBundle) {}
|
|
|
|
|
|
|
|
/// Uniform view over the two kinds of status values checked by CHECK_EC:
/// a `std::error_code` or a plain integer return code. Either way the
/// wrapper converts to the underlying integer value, so it is "truthy"
/// exactly when that value is non-zero.
class ErrorCodeWrapper {
  int errorCode; // raw numeric status; 0 means success

public:
  explicit ErrorCodeWrapper(int ec) : errorCode{ec} {}
  explicit ErrorCodeWrapper(std::error_code ec) : errorCode{ec.value()} {}

  /// Yields the stored numeric status.
  operator int() const { return errorCode; }
};
|
|
|
|
|
|
|
|
// Evaluates `exp` once, wraps the result in an ErrorCodeWrapper, and calls
// fatal() with the numeric code and the stringified expression when the
// result is non-zero.
//
// The do/while(0) wrapper deliberately has no trailing semicolon: the caller
// supplies it, which keeps `if (c) CHECK_EC(x); else ...` well-formed.
#define CHECK_EC(exp)                                                          \
  do {                                                                         \
    ErrorCodeWrapper ec(exp);                                                  \
    if (ec)                                                                    \
      fatal(Twine("operation failed with error code ") + Twine(ec) + ": " +    \
            #exp);                                                             \
  } while (0)
|
|
|
|
|
|
|
|
/// Creates the (currently empty, uncompressed) XAR archive backing this
/// section in a temporary file and records its path and size in `xarPath`
/// and `xarSize`.
///
/// Every step is fatal on failure, including the final size query: a
/// silently failed `file_size` would leave `xarSize` unset for writeTo().
/// Without LLVM_HAVE_LIBXAR this function does nothing.
void BitcodeBundleSection::finalize() {
#ifdef LLVM_HAVE_LIBXAR
  using namespace llvm::sys::fs;
  CHECK_EC(createTemporaryFile("bitcode-bundle", "xar", xarPath));

  // xar_open is deprecated; silence the warning for this one call only.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
  xar_t xar(xar_open(xarPath.data(), O_RDWR));
#pragma clang diagnostic pop
  if (!xar)
    fatal("failed to open XAR temporary file at " + xarPath);
  CHECK_EC(xar_opt_set(xar, XAR_OPT_COMPRESSION, XAR_OPT_VAL_NONE));
  // FIXME: add more data to XAR
  CHECK_EC(xar_close(xar));

  // Do not ignore the returned error code; see the function comment.
  CHECK_EC(file_size(xarPath, xarSize));
#endif // defined(LLVM_HAVE_LIBXAR)
}
|
|
|
|
|
|
|
|
void BitcodeBundleSection::writeTo(uint8_t *buf) const {
|
|
|
|
using namespace llvm::sys::fs;
|
|
|
|
file_t handle =
|
|
|
|
CHECK(openNativeFile(xarPath, CD_OpenExisting, FA_Read, OF_None),
|
|
|
|
"failed to open XAR file");
|
|
|
|
std::error_code ec;
|
|
|
|
mapped_file_region xarMap(handle, mapped_file_region::mapmode::readonly,
|
|
|
|
xarSize, 0, ec);
|
|
|
|
if (ec)
|
|
|
|
fatal("failed to map XAR file");
|
|
|
|
memcpy(buf, xarMap.const_data(), xarSize);
|
|
|
|
|
|
|
|
closeFile(handle);
|
|
|
|
remove(xarPath);
|
|
|
|
}
|
|
|
|
|
2021-06-29 10:22:21 +08:00
|
|
|
/// Registers the C-string section as a synthetic section, using the text
/// segment name and the cstring section name, and flags it as containing
/// C-string literals.
CStringSection::CStringSection()
    : SyntheticSection(
          segment_names::text, section_names::cString) {
  flags = S_CSTRING_LITERALS;
}
|
|
|
|
|
|
|
|
/// Adopts `isec` as one of this section's inputs: the input's parent is set
/// to this section, it is appended to `inputs`, and the section alignment is
/// raised to the input's alignment when that is larger.
void CStringSection::addInput(CStringInputSection *isec) {
  // Keep the output at least as strictly aligned as its most aligned input.
  if (align < isec->align)
    align = isec->align;

  isec->parent = this;
  inputs.push_back(isec);
}
|
|
|
|
|
|
|
|
void CStringSection::writeTo(uint8_t *buf) const {
|
|
|
|
for (const CStringInputSection *isec : inputs) {
|
|
|
|
for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
|
|
|
|
if (!isec->pieces[i].live)
|
|
|
|
continue;
|
|
|
|
StringRef string = isec->getStringRef(i);
|
|
|
|
memcpy(buf + isec->pieces[i].outSecOff, string.data(), string.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CStringSection::finalizeContents() {
|
|
|
|
uint64_t offset = 0;
|
|
|
|
for (CStringInputSection *isec : inputs) {
|
|
|
|
for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
|
|
|
|
if (!isec->pieces[i].live)
|
|
|
|
continue;
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
// See comment above DeduplicatedCStringSection for how alignment is
|
|
|
|
// handled.
|
|
|
|
uint32_t pieceAlign =
|
|
|
|
1 << countTrailingZeros(isec->align | isec->pieces[i].inSecOff);
|
2021-06-29 10:22:21 +08:00
|
|
|
offset = alignTo(offset, pieceAlign);
|
|
|
|
isec->pieces[i].outSecOff = offset;
|
|
|
|
isec->isFinal = true;
|
|
|
|
StringRef string = isec->getStringRef(i);
|
|
|
|
offset += string.size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
size = offset;
|
|
|
|
}
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
// Mergeable cstring literals are found under the __TEXT,__cstring section. In
|
|
|
|
// contrast to ELF, which puts strings that need different alignments into
|
|
|
|
// different sections, clang's Mach-O backend puts them all in one section.
|
|
|
|
// Strings that need to be aligned have the .p2align directive emitted before
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
// them, which simply translates into zero padding in the object file. In other
|
|
|
|
// words, we have to infer the desired alignment of these cstrings from their
|
|
|
|
// addresses.
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
//
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
// We differ slightly from ld64 in how we've chosen to align these cstrings.
|
|
|
|
// Both LLD and ld64 preserve the number of trailing zeros in each cstring's
|
|
|
|
// address in the input object files. When deduplicating identical cstrings,
|
|
|
|
// both linkers pick the cstring whose address has more trailing zeros, and
|
|
|
|
// preserve the alignment of that address in the final binary. However, ld64
|
|
|
|
// goes a step further and also preserves the offset of the cstring from the
|
|
|
|
// last section-aligned address. I.e. if a cstring is at offset 18 in the
|
|
|
|
// input, with a section alignment of 16, then both LLD and ld64 will ensure the
|
|
|
|
// final address is 2-byte aligned (since 18 == 16 + 2). But ld64 will also
|
|
|
|
// ensure that the final address is of the form 16 * k + 2 for some k.
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
//
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
// Note that ld64's heuristic means that a dedup'ed cstring's final address is
|
|
|
|
// dependent on the order of the input object files. E.g. if in addition to the
|
|
|
|
// cstring at offset 18 above, we have a duplicate one in another file with a
|
|
|
|
// `.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
|
|
|
|
// the cstring from the object file earlier on the command line (since both have
|
|
|
|
// the same number of trailing zeros in their address). So the final cstring may
|
|
|
|
// either be at some address `16 * k + 2` or at some address `2 * k`.
|
|
|
|
//
|
|
|
|
// I've opted not to follow this behavior primarily for implementation
|
|
|
|
// simplicity, and secondarily to save a few more bytes. It's not clear to me
|
|
|
|
// that preserving the section alignment + offset is ever necessary, and there
|
|
|
|
// are many cases that are clearly redundant. In particular, if an x86_64 object
|
|
|
|
// file contains some strings that are accessed via SIMD instructions, then the
|
|
|
|
// .cstring section in the object file will be 16-byte-aligned (since SIMD
|
|
|
|
// requires its operand addresses to be 16-byte aligned). However, there will
|
|
|
|
// typically also be other cstrings in the same file that aren't used via SIMD
|
|
|
|
// and don't need this alignment. They will be emitted at some arbitrary address
|
|
|
|
// `A`, but ld64 will treat them as being 16-byte aligned with an offset of
// `A % 16`.
|
2021-06-29 10:22:21 +08:00
|
|
|
void DeduplicatedCStringSection::finalizeContents() {
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
// Find the largest alignment required for each string.
|
|
|
|
for (const CStringInputSection *isec : inputs) {
|
|
|
|
for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
|
|
|
|
const StringPiece &piece = isec->pieces[i];
|
|
|
|
if (!piece.live)
|
|
|
|
continue;
|
|
|
|
auto s = isec->getCachedHashStringRef(i);
|
|
|
|
assert(isec->align != 0);
|
|
|
|
uint8_t trailingZeros = countTrailingZeros(isec->align | piece.inSecOff);
|
|
|
|
auto it = stringOffsetMap.insert(
|
|
|
|
std::make_pair(s, StringOffset(trailingZeros)));
|
|
|
|
if (!it.second && it.first->second.trailingZeros < trailingZeros)
|
|
|
|
it.first->second.trailingZeros = trailingZeros;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assign an offset for each string and save it to the corresponding
|
|
|
|
// StringPieces for easy access.
|
2022-01-15 05:12:57 +08:00
|
|
|
for (CStringInputSection *isec : inputs) {
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
|
|
|
|
if (!isec->pieces[i].live)
|
|
|
|
continue;
|
|
|
|
auto s = isec->getCachedHashStringRef(i);
|
|
|
|
auto it = stringOffsetMap.find(s);
|
|
|
|
assert(it != stringOffsetMap.end());
|
|
|
|
StringOffset &offsetInfo = it->second;
|
|
|
|
if (offsetInfo.outSecOff == UINT64_MAX) {
|
2022-04-07 18:19:54 +08:00
|
|
|
offsetInfo.outSecOff = alignTo(size, 1ULL << offsetInfo.trailingZeros);
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
size = offsetInfo.outSecOff + s.size();
|
|
|
|
}
|
|
|
|
isec->pieces[i].outSecOff = offsetInfo.outSecOff;
|
|
|
|
}
|
2022-01-15 05:12:57 +08:00
|
|
|
isec->isFinal = true;
|
|
|
|
}
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`16 % A`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
}
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
|
[lld-macho] Align cstrings less conservatively
Previously, we aligned every cstring to 16 bytes as a temporary hack to
deal with https://github.com/llvm/llvm-project/issues/50135. However, it
was highly wasteful in terms of binary size.
To recap, in contrast to ELF, which puts strings that need different
alignments into different sections, `clang`'s Mach-O backend puts them
all in one section. Strings that need to be aligned have the .p2align
directive emitted before them, which simply translates into zero padding
in the object file. In other words, we have to infer the alignment of
the cstrings from their addresses.
We differ slightly from ld64 in how we've chosen to align these
cstrings. Both LLD and ld64 preserve the number of trailing zeros in
each cstring's address in the input object files. When deduplicating
identical cstrings, both linkers pick the cstring whose address has more
trailing zeros, and preserve the alignment of that address in the final
binary. However, ld64 goes a step further and also preserves the offset
of the cstring from the last section-aligned address. I.e. if a cstring
is at offset 18 in the input, with a section alignment of 16, then both
LLD and ld64 will ensure the final address is 2-byte aligned (since
`18 == 16 + 2`). But ld64 will also ensure that the final address is of
the form 16 * k + 2 for some k (which implies 2-byte alignment).
Note that ld64's heuristic means that a dedup'ed cstring's final address is
dependent on the order of the input object files. E.g. if in addition to the
cstring at offset 18 above, we have a duplicate one in another file with a
`.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
the cstring from the object file earlier on the command line (since both have
the same number of trailing zeros in their address). So the final cstring may
either be at some address `16 * k + 2` or at some address `2 * k`.
I've opted not to follow this behavior primarily for implementation
simplicity, and secondarily to save a few more bytes. It's not clear to me
that preserving the section alignment + offset is ever necessary, and there
are many cases that are clearly redundant. In particular, if an x86_64 object
file contains some strings that are accessed via SIMD instructions, then the
.cstring section in the object file will be 16-byte-aligned (since SIMD
requires its operand addresses to be 16-byte aligned). However, there will
typically also be other cstrings in the same file that aren't used via SIMD
and don't need this alignment. They will be emitted at some arbitrary address
`A`, but ld64 will treat them as being 16-byte aligned with an offset of
`A % 16`.
I have verified that the two repros in https://github.com/llvm/llvm-project/issues/50135
work well with the new alignment behavior.
Fixes https://github.com/llvm/llvm-project/issues/54036.
Reviewed By: #lld-macho, oontvoo
Differential Revision: https://reviews.llvm.org/D121342
2022-03-11 04:04:31 +08:00
|
|
|
void DeduplicatedCStringSection::writeTo(uint8_t *buf) const {
|
|
|
|
for (const auto &p : stringOffsetMap) {
|
|
|
|
StringRef data = p.first.val();
|
|
|
|
uint64_t off = p.second.outSecOff;
|
|
|
|
if (!data.empty())
|
|
|
|
memcpy(buf + off, data.data(), data.size());
|
|
|
|
}
|
[lld-macho] Implement cstring deduplication
Our implementation draws heavily from LLD-ELF's, which in turn delegates
its string deduplication to llvm-mc's StringTableBuilder. The messiness of
this diff is largely due to the fact that we've previously assumed that
all InputSections get concatenated together to form the output. This is
no longer true with CStringInputSections, which split their contents into
StringPieces. StringPieces are much more lightweight than InputSections,
which is important as we create a lot of them. They may also overlap in
the output, which makes it possible for strings to be tail-merged. In
fact, the initial version of this diff implemented tail merging, but
I've dropped it for reasons I'll explain later.
**Alignment Issues**
Mergeable cstring literals are found under the `__TEXT,__cstring`
section. In contrast to ELF, which puts strings that need different
alignments into different sections, clang's Mach-O backend puts them all
in one section. Strings that need to be aligned have the `.p2align`
directive emitted before them, which simply translates into zero padding
in the object file.
I *think* ld64 extracts the desired per-string alignment from this data
by preserving each string's offset from the last section-aligned
address. I'm not entirely certain since it doesn't seem consistent about
doing this; but perhaps this can be chalked up to cases where ld64 has
to deduplicate strings with different offset/alignment combos -- it
seems to pick one of their alignments to preserve. This doesn't seem
correct in general; we can in fact can induce ld64 to produce a crashing
binary just by linking in an additional object file that only contains
cstrings and no code. See PR50563 for details.
Moreover, this scheme seems rather inefficient: since unaligned and
aligned strings are all put in the same section, which has a single
alignment value, it doesn't seem possible to tell whether a given string
doesn't have any alignment requirements. Preserving offset+alignments
for strings that don't need it is wasteful.
In practice, the crashes seen so far seem to stem from x86_64 SIMD
operations on cstrings. X86_64 requires SIMD accesses to be
16-byte-aligned. So for now, I'm thinking of just aligning all strings
to 16 bytes on x86_64. This is indeed wasteful, but implementation-wise
it's simpler than preserving per-string alignment+offsets. It also
avoids the aforementioned crash after deduplication of
differently-aligned strings. Finally, the overhead is not huge: using
16-byte alignment (vs no alignment) is only a 0.5% size overhead when
linking chromium_framework.
With these alignment requirements, it doesn't make sense to attempt tail
merging -- most strings will not be eligible since their overlaps aren't
likely to start at a 16-byte boundary. Tail-merging (with alignment) for
chromium_framework only improves size by 0.3%.
It's worth noting that LLD-ELF only does tail merging at `-O2`. By
default (at `-O1`), it just deduplicates w/o tail merging. @thakis has
also mentioned that they saw it regress compressed size in some cases
and therefore turned it off. `ld64` does not seem to do tail merging at
all.
**Performance Numbers**
CString deduplication reduces chromium_framework from 250MB to 242MB, or
about a 3.2% reduction.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.99 4.14 4.015 4.0365 0.0492336
Difference at 95.0% confidence
0.0865 +/- 0.027245
2.18987% +/- 0.689746%
(Student's t, pooled s = 0.0425673)
As expected, cstring merging incurs some non-trivial overhead.
When passing `--no-literal-merge`, it seems that performance is the
same, i.e. the refactoring in this diff didn't cost us.
N Min Max Median Avg Stddev
x 20 3.91 4.03 3.935 3.95 0.034641016
+ 20 3.89 4.02 3.935 3.9435 0.043197831
No difference proven at 95.0% confidence
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D102964
2021-06-08 11:47:12 +08:00
|
|
|
}
|
|
|
|
|
[lld-macho] Deduplicate fixed-width literals
Conceptually, the implementation is pretty straightforward: we put each
literal value into a hashtable, and then write out the keys of that
hashtable at the end.
In contrast with ELF, the Mach-O format does not support variable-length
literals that aren't strings. Its literals are either 4, 8, or 16 bytes
in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since
we don't need to worry about overly-long values, we should be able to do
a faster job by just hashing.
That said, the implementation right now is far from optimal, because we
add to those hashtables serially. To parallelize this, we'll need a
basic concurrent hashtable (only needs to support concurrent writes w/o
interleaved reads), which shouldn't be too hard to implement, but I'd like
to punt on it for now.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 4.27 4.39 4.315 4.3225 0.033225703
+ 20 4.36 4.82 4.44 4.4845 0.13152846
Difference at 95.0% confidence
0.162 +/- 0.0613971
3.74783% +/- 1.42041%
(Student's t, pooled s = 0.0959262)
This corresponds to binary size savings of 2MB out of 335MB, or 0.6%.
It's not a great tradeoff as-is, but as mentioned our implementation can
be significantly optimized, and literal dedup will unlock more
opportunities for ICF to identify identical structures that reference
the same literals.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D103113
2021-06-12 07:49:50 +08:00
|
|
|
// This section is actually emitted as __TEXT,__const by ld64, but clang may
|
|
|
|
// emit input sections of that name, and LLD doesn't currently support mixing
|
|
|
|
// synthetic and concat-type OutputSections. To work around this, I've given
|
|
|
|
// our merged-literals section a different name.
|
|
|
|
WordLiteralSection::WordLiteralSection()
|
|
|
|
: SyntheticSection(segment_names::text, section_names::literals) {
|
|
|
|
align = 16;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Registers `isec` with this output section: the input section is appended to
// `inputs` and its `parent` pointer is set to this section.
void WordLiteralSection::addInput(WordLiteralInputSection *isec) {
  inputs.push_back(isec);
  isec->parent = this;
}
|
|
|
|
|
|
|
|
void WordLiteralSection::finalizeContents() {
|
|
|
|
for (WordLiteralInputSection *isec : inputs) {
|
|
|
|
// We do all processing of the InputSection here, so it will be effectively
|
|
|
|
// finalized.
|
|
|
|
isec->isFinal = true;
|
|
|
|
const uint8_t *buf = isec->data.data();
|
2021-07-02 08:33:55 +08:00
|
|
|
switch (sectionType(isec->getFlags())) {
|
[lld-macho] Move ICF earlier to avoid emitting redundant binds
This is a pretty big refactoring diff, so here are the motivations:
Previously, ICF ran after scanRelocations(), where we emitting
bind/rebase opcodes etc. So we had a bunch of redundant leftovers after
ICF. Having ICF run before Writer seems like a better design, and is
what LLD-ELF does, so this diff refactors it accordingly.
However, ICF had two dependencies on things occurring in Writer: 1) it
needs literals to be deduplicated beforehand and 2) it needs to know
which functions have unwind info, which was being handled by
`UnwindInfoSection::prepareRelocations()`.
In order to do literal deduplication earlier, we need to add literal
input sections to their corresponding output sections. So instead of
putting all input sections into the big `inputSections` vector, and then
filtering them by type later on, I've changed things so that literal
sections get added directly to their output sections during the 'gather'
phase. Likewise for compact unwind sections -- they get added directly
to the UnwindInfoSection now. This latter change is not strictly
necessary, but makes it easier for ICF to determine which functions have
unwind info.
Adding literal sections directly to their output sections means that we
can no longer determine `inputOrder` from iterating over
`inputSections`. Instead, we store that order explicitly on
InputSection. Bloating the size of InputSection for this purpose would
be unfortunate -- but LLD-ELF has already solved this problem: it reuses
`outSecOff` to store this order value.
One downside of this refactor is that we now make an additional pass
over the unwind info relocations to figure out which functions have
unwind info, since want to know that before `processRelocations()`. I've
made sure to run that extra loop only if ICF is enabled, so there should
be no overhead in non-optimizing runs of the linker.
The upside of all this is that the `inputSections` vector now contains
only ConcatInputSections that are destined for ConcatOutputSections, so
we can clean up a bunch of code that just existed to filter out other
elements from that vector.
I will test for the lack of redundant binds/rebases in the upcoming
cfstring deduplication diff. While binds/rebases can also happen in the
regular `.text` section, they're more common in `.data` sections, so it
seems more natural to test it that way.
This change is perf-neutral when linking chromium_framework.
Reviewed By: oontvoo
Differential Revision: https://reviews.llvm.org/D105044
2021-07-02 08:33:42 +08:00
|
|
|
case S_4BYTE_LITERALS: {
|
|
|
|
for (size_t off = 0, e = isec->data.size(); off < e; off += 4) {
|
|
|
|
if (!isec->isLive(off))
|
|
|
|
continue;
|
|
|
|
uint32_t value = *reinterpret_cast<const uint32_t *>(buf + off);
|
|
|
|
literal4Map.emplace(value, literal4Map.size());
|
|
|
|
}
|
|
|
|
break;
|
[lld-macho] Deduplicate fixed-width literals
Conceptually, the implementation is pretty straightforward: we put each
literal value into a hashtable, and then write out the keys of that
hashtable at the end.
In contrast with ELF, the Mach-O format does not support variable-length
literals that aren't strings. Its literals are either 4, 8, or 16 bytes
in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since
we don't need to worry about overly-long values, we should be able to do
a faster job by just hashing.
That said, the implementation right now is far from optimal, because we
add to those hashtables serially. To parallelize this, we'll need a
basic concurrent hashtable (only needs to support concurrent writes w/o
interleave reads), which shouldn't be to hard to implement, but I'd like
to punt on it for now.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 4.27 4.39 4.315 4.3225 0.033225703
+ 20 4.36 4.82 4.44 4.4845 0.13152846
Difference at 95.0% confidence
0.162 +/- 0.0613971
3.74783% +/- 1.42041%
(Student's t, pooled s = 0.0959262)
This corresponds to binary size savings of 2MB out of 335MB, or 0.6%.
It's not a great tradeoff as-is, but as mentioned our implementation can
be signficantly optimized, and literal dedup will unlock more
opportunities for ICF to identify identical structures that reference
the same literals.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D103113
2021-06-12 07:49:50 +08:00
|
|
|
}
|
[lld-macho] Move ICF earlier to avoid emitting redundant binds
This is a pretty big refactoring diff, so here are the motivations:
Previously, ICF ran after scanRelocations(), where we emitting
bind/rebase opcodes etc. So we had a bunch of redundant leftovers after
ICF. Having ICF run before Writer seems like a better design, and is
what LLD-ELF does, so this diff refactors it accordingly.
However, ICF had two dependencies on things occurring in Writer: 1) it
needs literals to be deduplicated beforehand and 2) it needs to know
which functions have unwind info, which was being handled by
`UnwindInfoSection::prepareRelocations()`.
In order to do literal deduplication earlier, we need to add literal
input sections to their corresponding output sections. So instead of
putting all input sections into the big `inputSections` vector, and then
filtering them by type later on, I've changed things so that literal
sections get added directly to their output sections during the 'gather'
phase. Likewise for compact unwind sections -- they get added directly
to the UnwindInfoSection now. This latter change is not strictly
necessary, but makes it easier for ICF to determine which functions have
unwind info.
Adding literal sections directly to their output sections means that we
can no longer determine `inputOrder` from iterating over
`inputSections`. Instead, we store that order explicitly on
InputSection. Bloating the size of InputSection for this purpose would
be unfortunate -- but LLD-ELF has already solved this problem: it reuses
`outSecOff` to store this order value.
One downside of this refactor is that we now make an additional pass
over the unwind info relocations to figure out which functions have
unwind info, since want to know that before `processRelocations()`. I've
made sure to run that extra loop only if ICF is enabled, so there should
be no overhead in non-optimizing runs of the linker.
The upside of all this is that the `inputSections` vector now contains
only ConcatInputSections that are destined for ConcatOutputSections, so
we can clean up a bunch of code that just existed to filter out other
elements from that vector.
I will test for the lack of redundant binds/rebases in the upcoming
cfstring deduplication diff. While binds/rebases can also happen in the
regular `.text` section, they're more common in `.data` sections, so it
seems more natural to test it that way.
This change is perf-neutral when linking chromium_framework.
Reviewed By: oontvoo
Differential Revision: https://reviews.llvm.org/D105044
2021-07-02 08:33:42 +08:00
|
|
|
case S_8BYTE_LITERALS: {
|
|
|
|
for (size_t off = 0, e = isec->data.size(); off < e; off += 8) {
|
|
|
|
if (!isec->isLive(off))
|
|
|
|
continue;
|
|
|
|
uint64_t value = *reinterpret_cast<const uint64_t *>(buf + off);
|
|
|
|
literal8Map.emplace(value, literal8Map.size());
|
|
|
|
}
|
|
|
|
break;
|
[lld-macho] Deduplicate fixed-width literals
Conceptually, the implementation is pretty straightforward: we put each
literal value into a hashtable, and then write out the keys of that
hashtable at the end.
In contrast with ELF, the Mach-O format does not support variable-length
literals that aren't strings. Its literals are either 4, 8, or 16 bytes
in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since
we don't need to worry about overly-long values, we should be able to do
a faster job by just hashing.
That said, the implementation right now is far from optimal, because we
add to those hashtables serially. To parallelize this, we'll need a
basic concurrent hashtable (only needs to support concurrent writes w/o
interleave reads), which shouldn't be to hard to implement, but I'd like
to punt on it for now.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 4.27 4.39 4.315 4.3225 0.033225703
+ 20 4.36 4.82 4.44 4.4845 0.13152846
Difference at 95.0% confidence
0.162 +/- 0.0613971
3.74783% +/- 1.42041%
(Student's t, pooled s = 0.0959262)
This corresponds to binary size savings of 2MB out of 335MB, or 0.6%.
It's not a great tradeoff as-is, but as mentioned our implementation can
be signficantly optimized, and literal dedup will unlock more
opportunities for ICF to identify identical structures that reference
the same literals.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D103113
2021-06-12 07:49:50 +08:00
|
|
|
}
|
[lld-macho] Move ICF earlier to avoid emitting redundant binds
This is a pretty big refactoring diff, so here are the motivations:
Previously, ICF ran after scanRelocations(), where we emitting
bind/rebase opcodes etc. So we had a bunch of redundant leftovers after
ICF. Having ICF run before Writer seems like a better design, and is
what LLD-ELF does, so this diff refactors it accordingly.
However, ICF had two dependencies on things occurring in Writer: 1) it
needs literals to be deduplicated beforehand and 2) it needs to know
which functions have unwind info, which was being handled by
`UnwindInfoSection::prepareRelocations()`.
In order to do literal deduplication earlier, we need to add literal
input sections to their corresponding output sections. So instead of
putting all input sections into the big `inputSections` vector, and then
filtering them by type later on, I've changed things so that literal
sections get added directly to their output sections during the 'gather'
phase. Likewise for compact unwind sections -- they get added directly
to the UnwindInfoSection now. This latter change is not strictly
necessary, but makes it easier for ICF to determine which functions have
unwind info.
Adding literal sections directly to their output sections means that we
can no longer determine `inputOrder` from iterating over
`inputSections`. Instead, we store that order explicitly on
InputSection. Bloating the size of InputSection for this purpose would
be unfortunate -- but LLD-ELF has already solved this problem: it reuses
`outSecOff` to store this order value.
One downside of this refactor is that we now make an additional pass
over the unwind info relocations to figure out which functions have
unwind info, since want to know that before `processRelocations()`. I've
made sure to run that extra loop only if ICF is enabled, so there should
be no overhead in non-optimizing runs of the linker.
The upside of all this is that the `inputSections` vector now contains
only ConcatInputSections that are destined for ConcatOutputSections, so
we can clean up a bunch of code that just existed to filter out other
elements from that vector.
I will test for the lack of redundant binds/rebases in the upcoming
cfstring deduplication diff. While binds/rebases can also happen in the
regular `.text` section, they're more common in `.data` sections, so it
seems more natural to test it that way.
This change is perf-neutral when linking chromium_framework.
Reviewed By: oontvoo
Differential Revision: https://reviews.llvm.org/D105044
2021-07-02 08:33:42 +08:00
|
|
|
case S_16BYTE_LITERALS: {
|
|
|
|
for (size_t off = 0, e = isec->data.size(); off < e; off += 16) {
|
|
|
|
if (!isec->isLive(off))
|
|
|
|
continue;
|
|
|
|
UInt128 value = *reinterpret_cast<const UInt128 *>(buf + off);
|
|
|
|
literal16Map.emplace(value, literal16Map.size());
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("invalid literal section type");
|
[lld-macho] Deduplicate fixed-width literals
Conceptually, the implementation is pretty straightforward: we put each
literal value into a hashtable, and then write out the keys of that
hashtable at the end.
In contrast with ELF, the Mach-O format does not support variable-length
literals that aren't strings. Its literals are either 4, 8, or 16 bytes
in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since
we don't need to worry about overly-long values, we should be able to do
a faster job by just hashing.
That said, the implementation right now is far from optimal, because we
add to those hashtables serially. To parallelize this, we'll need a
basic concurrent hashtable (only needs to support concurrent writes w/o
interleave reads), which shouldn't be to hard to implement, but I'd like
to punt on it for now.
Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:
N Min Max Median Avg Stddev
x 20 4.27 4.39 4.315 4.3225 0.033225703
+ 20 4.36 4.82 4.44 4.4845 0.13152846
Difference at 95.0% confidence
0.162 +/- 0.0613971
3.74783% +/- 1.42041%
(Student's t, pooled s = 0.0959262)
This corresponds to binary size savings of 2MB out of 335MB, or 0.6%.
It's not a great tradeoff as-is, but as mentioned our implementation can
be signficantly optimized, and literal dedup will unlock more
opportunities for ICF to identify identical structures that reference
the same literals.
Reviewed By: #lld-macho, gkm
Differential Revision: https://reviews.llvm.org/D103113
2021-06-12 07:49:50 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void WordLiteralSection::writeTo(uint8_t *buf) const {
|
|
|
|
// Note that we don't attempt to do any endianness conversion in addInput(),
|
|
|
|
// so we don't do it here either -- just write out the original value,
|
|
|
|
// byte-for-byte.
|
|
|
|
for (const auto &p : literal16Map)
|
|
|
|
memcpy(buf + p.second * 16, &p.first, 16);
|
|
|
|
buf += literal16Map.size() * 16;
|
|
|
|
|
|
|
|
for (const auto &p : literal8Map)
|
|
|
|
memcpy(buf + p.second * 8, &p.first, 8);
|
|
|
|
buf += literal8Map.size() * 8;
|
|
|
|
|
|
|
|
for (const auto &p : literal4Map)
|
|
|
|
memcpy(buf + p.second * 4, &p.first, 4);
|
|
|
|
}
|
|
|
|
|
2021-03-19 06:49:45 +08:00
|
|
|
void macho::createSyntheticSymbols() {
|
|
|
|
auto addHeaderSymbol = [](const char *name) {
|
2021-05-17 21:15:39 +08:00
|
|
|
symtab->addSynthetic(name, in.header->isec, /*value=*/0,
|
2022-01-10 04:21:02 +08:00
|
|
|
/*isPrivateExtern=*/true, /*includeInSymtab=*/false,
|
2021-05-17 21:15:39 +08:00
|
|
|
/*referencedDynamically=*/false);
|
2021-03-19 06:49:45 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
switch (config->outputType) {
|
2021-03-20 02:17:51 +08:00
|
|
|
// FIXME: Assign the right address value for these symbols
|
2021-03-19 06:49:45 +08:00
|
|
|
// (rather than 0). But we need to do that after assignAddresses().
|
|
|
|
case MH_EXECUTE:
|
|
|
|
// If linking PIE, __mh_execute_header is a defined symbol in
|
|
|
|
// __TEXT, __text)
|
|
|
|
// Otherwise, it's an absolute symbol.
|
|
|
|
if (config->isPic)
|
2021-03-30 08:33:48 +08:00
|
|
|
symtab->addSynthetic("__mh_execute_header", in.header->isec, /*value=*/0,
|
2022-01-10 04:21:02 +08:00
|
|
|
/*isPrivateExtern=*/false, /*includeInSymtab=*/true,
|
2021-05-17 21:15:39 +08:00
|
|
|
/*referencedDynamically=*/true);
|
2021-03-19 06:49:45 +08:00
|
|
|
else
|
2021-05-17 21:15:39 +08:00
|
|
|
symtab->addSynthetic("__mh_execute_header", /*isec=*/nullptr, /*value=*/0,
|
2022-01-10 04:21:02 +08:00
|
|
|
/*isPrivateExtern=*/false, /*includeInSymtab=*/true,
|
2021-05-17 21:15:39 +08:00
|
|
|
/*referencedDynamically=*/true);
|
2021-03-19 06:49:45 +08:00
|
|
|
break;
|
|
|
|
|
2021-05-17 21:15:39 +08:00
|
|
|
// The following symbols are N_SECT symbols, even though the header is not
|
2021-03-19 06:49:45 +08:00
|
|
|
// part of any section and that they are private to the bundle/dylib/object
|
|
|
|
// they are part of.
|
|
|
|
case MH_BUNDLE:
|
|
|
|
addHeaderSymbol("__mh_bundle_header");
|
|
|
|
break;
|
|
|
|
case MH_DYLIB:
|
|
|
|
addHeaderSymbol("__mh_dylib_header");
|
|
|
|
break;
|
|
|
|
case MH_DYLINKER:
|
|
|
|
addHeaderSymbol("__mh_dylinker_header");
|
|
|
|
break;
|
|
|
|
case MH_OBJECT:
|
|
|
|
addHeaderSymbol("__mh_object_header");
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("unexpected outputType");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The Itanium C++ ABI requires dylibs to pass a pointer to __cxa_atexit
|
|
|
|
// which does e.g. cleanup of static global variables. The ABI document
|
|
|
|
// says that the pointer can point to any address in one of the dylib's
|
|
|
|
// segments, but in practice ld64 seems to set it to point to the header,
|
|
|
|
// so that's what's implemented here.
|
|
|
|
addHeaderSymbol("___dso_handle");
|
|
|
|
}
|
2021-04-03 06:46:18 +08:00
|
|
|
|
|
|
|
// Explicit instantiations of makeSymtabSection for the LP64 and ILP32
// template arguments.
template SymtabSection *macho::makeSymtabSection<LP64>(StringTableSection &);
|
|
|
|
template SymtabSection *macho::makeSymtabSection<ILP32>(StringTableSection &);
|