[lld-macho] Deduplicate fixed-width literals

Conceptually, the implementation is pretty straightforward: we put each
literal value into a hashtable, and then write out the keys of that
hashtable at the end.

In contrast with ELF, the Mach-O format does not support variable-length
literals that aren't strings. Its literals are either 4, 8, or 16 bytes
in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since
we don't need to worry about overly-long values, we should be able to do
a faster job by just hashing.

That said, the implementation right now is far from optimal, because we
add to those hashtables serially. To parallelize this, we'll need a
basic concurrent hashtable (only needs to support concurrent writes w/o
interleave reads), which shouldn't be to hard to implement, but I'd like
to punt on it for now.

Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W:

      N           Min           Max        Median           Avg        Stddev
  x  20          4.27          4.39         4.315        4.3225   0.033225703
  +  20          4.36          4.82          4.44        4.4845    0.13152846
  Difference at 95.0% confidence
          0.162 +/- 0.0613971
          3.74783% +/- 1.42041%
          (Student's t, pooled s = 0.0959262)

This corresponds to binary size savings of 2MB out of 335MB, or 0.6%.
It's not a great tradeoff as-is, but as mentioned our implementation can
be signficantly optimized, and literal dedup will unlock more
opportunities for ICF to identify identical structures that reference
the same literals.

Reviewed By: #lld-macho, gkm

Differential Revision: https://reviews.llvm.org/D103113
This commit is contained in:
Jez Ng 2021-06-11 19:49:50 -04:00
parent b90f9bea96
commit 5d88f2dd94
8 changed files with 291 additions and 14 deletions

View File

@ -265,16 +265,25 @@ void ObjFile::parseSections(ArrayRef<Section> sections) {
auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
for (const Section &sec : sections) {
if (config->dedupLiterals && sectionType(sec.flags) == S_CSTRING_LITERALS) {
if (config->dedupLiterals &&
(sectionType(sec.flags) == S_CSTRING_LITERALS ||
isWordLiteralSection(sec.flags))) {
if (sec.nreloc)
fatal(toString(this) + " contains relocations in " + sec.segname + "," +
sec.sectname +
", so LLD cannot deduplicate literals. Try re-running without "
"--deduplicate-literals.");
auto *isec = make<CStringInputSection>();
InputSection *isec;
if (sectionType(sec.flags) == S_CSTRING_LITERALS) {
isec = make<CStringInputSection>();
parseSection(this, buf, sec, isec);
isec->splitIntoPieces(); // FIXME: parallelize this?
// FIXME: parallelize this?
cast<CStringInputSection>(isec)->splitIntoPieces();
} else {
isec = make<WordLiteralInputSection>();
parseSection(this, buf, sec, isec);
}
subsections.push_back({{0, isec}});
} else {
auto *isec = make<ConcatInputSection>();

View File

@ -127,6 +127,25 @@ uint64_t CStringInputSection::getOffset(uint64_t off) const {
return piece.outSecOff + addend;
}
uint64_t WordLiteralInputSection::getFileOffset(uint64_t off) const {
return parent->fileOff + getOffset(off);
}
uint64_t WordLiteralInputSection::getOffset(uint64_t off) const {
auto *osec = cast<WordLiteralSection>(parent);
const uint8_t *buf = data.data();
switch (sectionType(flags)) {
case S_4BYTE_LITERALS:
return osec->getLiteral4Offset(buf + off);
case S_8BYTE_LITERALS:
return osec->getLiteral8Offset(buf + off);
case S_16BYTE_LITERALS:
return osec->getLiteral16Offset(buf + off);
default:
llvm_unreachable("invalid literal section type");
}
}
bool macho::isCodeSection(const InputSection *isec) {
uint32_t type = sectionType(isec->flags);
if (type != S_REGULAR && type != S_COALESCED)

View File

@ -28,6 +28,7 @@ public:
enum Kind {
ConcatKind,
CStringLiteralKind,
WordLiteralKind,
};
Kind kind() const { return sectionKind; }
@ -146,6 +147,17 @@ public:
std::vector<StringPiece> pieces;
};
class WordLiteralInputSection : public InputSection {
public:
WordLiteralInputSection() : InputSection(WordLiteralKind) {}
uint64_t getFileOffset(uint64_t off) const override;
uint64_t getOffset(uint64_t off) const override;
static bool classof(const InputSection *isec) {
return isec->kind() == WordLiteralKind;
}
};
inline uint8_t sectionType(uint32_t flags) {
return flags & llvm::MachO::SECTION_TYPE;
}
@ -169,6 +181,12 @@ inline bool isDebugSection(uint32_t flags) {
llvm::MachO::S_ATTR_DEBUG;
}
inline bool isWordLiteralSection(uint32_t flags) {
return sectionType(flags) == llvm::MachO::S_4BYTE_LITERALS ||
sectionType(flags) == llvm::MachO::S_8BYTE_LITERALS ||
sectionType(flags) == llvm::MachO::S_16BYTE_LITERALS;
}
bool isCodeSection(const InputSection *);
extern std::vector<InputSection *> inputSections;
@ -197,6 +215,7 @@ constexpr const char indirectSymbolTable[] = "__ind_sym_tab";
constexpr const char const_[] = "__const";
constexpr const char lazySymbolPtr[] = "__la_symbol_ptr";
constexpr const char lazyBinding[] = "__lazy_binding";
constexpr const char literals[] = "__literals";
constexpr const char moduleInitFunc[] = "__mod_init_func";
constexpr const char moduleTermFunc[] = "__mod_term_func";
constexpr const char nonLazySymbolPtr[] = "__nl_symbol_ptr";

View File

@ -1141,6 +1141,64 @@ void CStringSection::finalize() {
}
}
// This section is actually emitted as __TEXT,__const by ld64, but clang may
// emit input sections of that name, and LLD doesn't currently support mixing
// synthetic and concat-type OutputSections. To work around this, I've given
// our merged-literals section a different name.
WordLiteralSection::WordLiteralSection()
: SyntheticSection(segment_names::text, section_names::literals) {
align = 16;
}
void WordLiteralSection::addInput(WordLiteralInputSection *isec) {
isec->parent = this;
// We do all processing of the InputSection here, so it will be effectively
// finalized.
isec->isFinal = true;
const uint8_t *buf = isec->data.data();
switch (sectionType(isec->flags)) {
case S_4BYTE_LITERALS: {
for (size_t i = 0, e = isec->data.size() / 4; i < e; ++i) {
uint32_t value = *reinterpret_cast<const uint32_t *>(buf + i * 4);
literal4Map.emplace(value, literal4Map.size());
}
break;
}
case S_8BYTE_LITERALS: {
for (size_t i = 0, e = isec->data.size() / 8; i < e; ++i) {
uint64_t value = *reinterpret_cast<const uint64_t *>(buf + i * 8);
literal8Map.emplace(value, literal8Map.size());
}
break;
}
case S_16BYTE_LITERALS: {
for (size_t i = 0, e = isec->data.size() / 16; i < e; ++i) {
UInt128 value = *reinterpret_cast<const UInt128 *>(buf + i * 16);
literal16Map.emplace(value, literal16Map.size());
}
break;
}
default:
llvm_unreachable("invalid literal section type");
}
}
void WordLiteralSection::writeTo(uint8_t *buf) const {
// Note that we don't attempt to do any endianness conversion in addInput(),
// so we don't do it here either -- just write out the original value,
// byte-for-byte.
for (const auto &p : literal16Map)
memcpy(buf + p.second * 16, &p.first, 16);
buf += literal16Map.size() * 16;
for (const auto &p : literal8Map)
memcpy(buf + p.second * 8, &p.first, 8);
buf += literal8Map.size() * 8;
for (const auto &p : literal4Map)
memcpy(buf + p.second * 4, &p.first, 4);
}
void macho::createSyntheticSymbols() {
auto addHeaderSymbol = [](const char *name) {
symtab->addSynthetic(name, in.header->isec, /*value=*/0,

View File

@ -22,6 +22,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <unordered_map>
namespace llvm {
class DWARFUnit;
} // namespace llvm
@ -297,6 +299,7 @@ public:
// have a corresponding entry in the LazyPointerSection.
bool addEntry(Symbol *);
uint64_t getVA(uint32_t stubsIndex) const {
assert(isFinal || target->usesThunks());
// ConcatOutputSection::finalize() can seek the address of a
// stub before its address is assigned. Before __stubs is
// finalized, return a contrived out-of-range address.
@ -531,9 +534,61 @@ private:
llvm::StringTableBuilder builder;
};
/*
* This section contains deduplicated literal values. The 16-byte values are
* laid out first, followed by the 8- and then the 4-byte ones.
*/
class WordLiteralSection : public SyntheticSection {
public:
using UInt128 = std::pair<uint64_t, uint64_t>;
// I don't think the standard guarantees the size of a pair, so let's make
// sure it's exact -- that way we can construct it via `mmap`.
static_assert(sizeof(UInt128) == 16, "");
WordLiteralSection();
void addInput(WordLiteralInputSection *);
void writeTo(uint8_t *buf) const override;
uint64_t getSize() const override {
return literal16Map.size() * 16 + literal8Map.size() * 8 +
literal4Map.size() * 4;
}
bool isNeeded() const override {
return !literal16Map.empty() || !literal4Map.empty() ||
!literal8Map.empty();
}
uint64_t getLiteral16Offset(const uint8_t *buf) const {
return literal16Map.at(*reinterpret_cast<const UInt128 *>(buf)) * 16;
}
uint64_t getLiteral8Offset(const uint8_t *buf) const {
return literal16Map.size() * 16 +
literal8Map.at(*reinterpret_cast<const uint64_t *>(buf)) * 8;
}
uint64_t getLiteral4Offset(const uint8_t *buf) const {
return literal16Map.size() * 16 + literal8Map.size() * 8 +
literal4Map.at(*reinterpret_cast<const uint32_t *>(buf)) * 4;
}
private:
template <class T> struct Hasher {
llvm::hash_code operator()(T v) const { return llvm::hash_value(v); }
};
// We're using unordered_map instead of DenseMap here because we need to
// support all possible integer values -- there are no suitable tombstone
// values for DenseMap.
std::unordered_map<UInt128, uint64_t, Hasher<UInt128>> literal16Map;
std::unordered_map<uint64_t, uint64_t> literal8Map;
std::unordered_map<uint32_t, uint64_t> literal4Map;
};
struct InStruct {
MachHeaderSection *header = nullptr;
CStringSection *cStringSection = nullptr;
WordLiteralSection *wordLiteralSection = nullptr;
RebaseSection *rebase = nullptr;
BindingSection *binding = nullptr;
WeakBindingSection *weakBinding = nullptr;

View File

@ -863,19 +863,24 @@ template <class LP> void Writer::createOutputSections() {
InputSection *isec = p.value();
if (isec->shouldOmitFromOutput())
continue;
OutputSection *osec;
if (auto *concatIsec = dyn_cast<ConcatInputSection>(isec)) {
NamePair names = maybeRenameSection({isec->segname, isec->name});
ConcatOutputSection *&osec = concatOutputSections[names];
if (osec == nullptr) {
osec = make<ConcatOutputSection>(names.second);
osec->inputOrder = p.index();
}
osec->addInput(concatIsec);
ConcatOutputSection *&concatOsec = concatOutputSections[names];
if (concatOsec == nullptr)
concatOsec = make<ConcatOutputSection>(names.second);
concatOsec->addInput(concatIsec);
osec = concatOsec;
} else if (auto *cStringIsec = dyn_cast<CStringInputSection>(isec)) {
if (in.cStringSection->inputs.empty())
in.cStringSection->inputOrder = p.index();
in.cStringSection->addInput(cStringIsec);
osec = in.cStringSection;
} else if (auto *litIsec = dyn_cast<WordLiteralInputSection>(isec)) {
in.wordLiteralSection->addInput(litIsec);
osec = in.wordLiteralSection;
} else {
llvm_unreachable("unhandled InputSection type");
}
osec->inputOrder = std::min(osec->inputOrder, static_cast<int>(p.index()));
}
// Once all the inputs are added, we can finalize the output section
@ -1050,6 +1055,8 @@ template <class LP> void macho::writeResult() { Writer().run<LP>(); }
void macho::createSyntheticSections() {
in.header = make<MachHeaderSection>();
in.cStringSection = config->dedupLiterals ? make<CStringSection>() : nullptr;
in.wordLiteralSection =
config->dedupLiterals ? make<WordLiteralSection>() : nullptr;
in.rebase = make<RebaseSection>();
in.binding = make<BindingSection>();
in.weakBinding = make<WeakBindingSection>();

View File

@ -0,0 +1,110 @@
# REQUIRES: x86
# RUN: rm -rf %t; split-file %s %t
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/qux.s -o %t/qux.o
# RUN: %lld -dylib --deduplicate-literals %t/test.o %t/qux.o -o %t/test
# RUN: llvm-objdump --macho --section="__TEXT,__literals" --section="__DATA,ptrs" --syms %t/test | FileCheck %s
# RUN: llvm-readobj --section-headers %t/test | FileCheck %s --check-prefix=HEADER
# CHECK: Contents of (__TEXT,__literals) section
# CHECK-NEXT: [[#%.16x,DEADBEEF16:]] ef be ad de ef be ad de ef be ad de ef be ad de
# CHECK-NEXT: [[#%.16x,FEEDFACE16:]] ce fa ed fe ce fa ed fe ce fa ed fe ce fa ed fe
# CHECK-NEXT: [[#%.16x,DEADBEEF8:]] ef be ad de ef be ad de ce fa ed fe ce fa ed fe
# CHECK-NEXT: [[#%.16x,DEADBEEF4:]] ef be ad de ce fa ed fe
# CHECK-NEXT: Contents of (__DATA,ptrs) section
# CHECK-NEXT: 0000000000001000 0x[[#%x,DEADBEEF16]]
# CHECK-NEXT: 0000000000001008 0x[[#%x,DEADBEEF16]]
# CHECK-NEXT: 0000000000001010 0x[[#%x,FEEDFACE16]]
# CHECK-NEXT: 0000000000001018 0x[[#%x,DEADBEEF16]]
# CHECK-NEXT: 0000000000001020 0x[[#%x,DEADBEEF8]]
# CHECK-NEXT: 0000000000001028 0x[[#%x,DEADBEEF8]]
# CHECK-NEXT: 0000000000001030 0x[[#%x,DEADBEEF8 + 8]]
# CHECK-NEXT: 0000000000001038 0x[[#%x,DEADBEEF8]]
# CHECK-NEXT: 0000000000001040 0x[[#%x,DEADBEEF4]]
# CHECK-NEXT: 0000000000001048 0x[[#%x,DEADBEEF4]]
# CHECK-NEXT: 0000000000001050 0x[[#%x,DEADBEEF4 + 4]]
# CHECK-NEXT: 0000000000001058 0x[[#%x,DEADBEEF4]]
## Make sure the symbol addresses are correct too.
# CHECK: SYMBOL TABLE:
# CHECK-DAG: [[#DEADBEEF16]] g O __TEXT,__literals _qux16
# CHECK-DAG: [[#DEADBEEF8]] g O __TEXT,__literals _qux8
# CHECK-DAG: [[#DEADBEEF4]] g O __TEXT,__literals _qux4
## Make sure we set the right alignment and flags.
# HEADER: Name: __literals
# HEADER-NEXT: Segment: __TEXT
# HEADER-NEXT: Address:
# HEADER-NEXT: Size:
# HEADER-NEXT: Offset:
# HEADER-NEXT: Alignment: 4
# HEADER-NEXT: RelocationOffset:
# HEADER-NEXT: RelocationCount: 0
# HEADER-NEXT: Type: Regular
# HEADER-NEXT: Attributes [ (0x0)
# HEADER-NEXT: ]
# HEADER-NEXT: Reserved1: 0x0
# HEADER-NEXT: Reserved2: 0x0
# HEADER-NEXT: Reserved3: 0x0
#--- test.s
.literal4
.p2align 2
L._foo4:
.long 0xdeadbeef
L._bar4:
.long 0xdeadbeef
L._baz4:
.long 0xfeedface
.literal8
L._foo8:
.quad 0xdeadbeefdeadbeef
L._bar8:
.quad 0xdeadbeefdeadbeef
L._baz8:
.quad 0xfeedfacefeedface
.literal16
L._foo16:
.quad 0xdeadbeefdeadbeef
.quad 0xdeadbeefdeadbeef
L._bar16:
.quad 0xdeadbeefdeadbeef
.quad 0xdeadbeefdeadbeef
L._baz16:
.quad 0xfeedfacefeedface
.quad 0xfeedfacefeedface
.section __DATA,ptrs,literal_pointers
.quad L._foo16
.quad L._bar16
.quad L._baz16
.quad _qux16
.quad L._foo8
.quad L._bar8
.quad L._baz8
.quad _qux8
.quad L._foo4
.quad L._bar4
.quad L._baz4
.quad _qux4
#--- qux.s
.globl _qux4, _qux8, _qux16
.literal4
.p2align 2
_qux4:
.long 0xdeadbeef
.literal8
_qux8:
.quad 0xdeadbeefdeadbeef
.literal16
_qux16:
.quad 0xdeadbeefdeadbeef
.quad 0xdeadbeefdeadbeef

View File

@ -11,14 +11,14 @@
; FMA: <_foo>:
; FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-NEXT: vfmsub213ss 7(%rip), %xmm1, %xmm0
; FMA-NEXT: vfmsub213ss [[#]](%rip), %xmm1, %xmm0
; FMA-NEXT: vfnmadd132ss %xmm1, %xmm1, %xmm0
; FMA-NEXT: retq
; NO-FMA: <_foo>:
; NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; NO-FMA-NEXT: vmovss 16(%rip), %xmm2
; NO-FMA-NEXT: vmovss [[#]](%rip), %xmm2
; NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0