2020-04-03 02:54:05 +08:00
|
|
|
//===- InputFiles.cpp -----------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file contains functions to parse Mach-O object files. In this comment,
|
|
|
|
// we describe the Mach-O file structure and how we parse it.
|
|
|
|
//
|
|
|
|
// Mach-O is not very different from ELF or COFF. The notion of symbols,
|
|
|
|
// sections and relocations exists in Mach-O as it does in ELF and COFF.
|
|
|
|
//
|
|
|
|
// Perhaps the notion that is new to those who know ELF/COFF is "subsections".
|
|
|
|
// In ELF/COFF, sections are an atomic unit of data copied from input files to
|
|
|
|
// output files. When we merge or garbage-collect sections, we treat each
|
|
|
|
// section as an atomic unit. In Mach-O, that's not the case. Sections can
|
|
|
|
// consist of multiple subsections, and subsections are a unit of merging and
|
|
|
|
// garbage-collecting. Therefore, Mach-O's subsections are more similar to
|
|
|
|
// ELF/COFF's sections than Mach-O's sections are.
|
|
|
|
//
|
|
|
|
// A section can have multiple symbols. A symbol that does not have the
|
|
|
|
// N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by
|
|
|
|
// definition, a symbol is always present at the beginning of each subsection. A
|
|
|
|
// symbol with N_ALT_ENTRY attribute does not start a new subsection and can
|
|
|
|
// point to the middle of a subsection.
|
|
|
|
//
|
|
|
|
// The notion of subsections also affects how relocations are represented in
|
|
|
|
// Mach-O. All references within a section need to be explicitly represented as
|
|
|
|
// relocations if they refer to different subsections, because we obviously need
|
|
|
|
// to fix up addresses if subsections are laid out in an output file differently
|
|
|
|
// than they were in object files. To represent that, Mach-O relocations can
|
|
|
|
// refer to an unnamed location via its address. Scattered relocations (those
|
|
|
|
// with the R_SCATTERED bit set) always refer to unnamed locations.
|
|
|
|
// Non-scattered relocations refer to an unnamed location if r_extern is not set
|
|
|
|
// and r_symbolnum is zero.
|
|
|
|
//
|
|
|
|
// Besides the above differences, I think you can use your knowledge about ELF
|
|
|
|
// and COFF for Mach-O.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "InputFiles.h"
|
2020-04-24 11:16:49 +08:00
|
|
|
#include "Config.h"
|
[lld-macho] Use export trie instead of symtab when linking against dylibs
Summary:
This allows us to link against stripped dylibs. Moreover, it's simply
more correct: The symbol table includes symbols that the dylib uses but
doesn't export.
This temporarily regresses our ability to do lazy symbol binding because
dyld_stub_binder isn't in libSystem's export trie. Rather, it is in one
of the sub-libraries libSystem re-exports. (This doesn't affect our
tests since we are mocking out dyld_stub_binder there.) A follow-up diff
will address this by adding support for sub-libraries.
Depends on D79114.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: mgorny, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D79226
2020-04-23 11:00:57 +08:00
|
|
|
#include "ExportTrie.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "InputSection.h"
|
2020-05-02 07:29:06 +08:00
|
|
|
#include "OutputSection.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "SymbolTable.h"
|
|
|
|
#include "Symbols.h"
|
|
|
|
#include "Target.h"
|
|
|
|
|
|
|
|
#include "lld/Common/ErrorHandler.h"
|
|
|
|
#include "lld/Common/Memory.h"
|
|
|
|
#include "llvm/BinaryFormat/MachO.h"
|
|
|
|
#include "llvm/Support/Endian.h"
|
|
|
|
#include "llvm/Support/MemoryBuffer.h"
|
2020-04-24 11:16:49 +08:00
|
|
|
#include "llvm/Support/Path.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
using namespace llvm::MachO;
|
|
|
|
using namespace llvm::support::endian;
|
2020-04-24 11:16:49 +08:00
|
|
|
using namespace llvm::sys;
|
2020-04-03 02:54:05 +08:00
|
|
|
using namespace lld;
|
|
|
|
using namespace lld::macho;
|
|
|
|
|
|
|
|
// Definition of the macho::inputFiles global: a list of InputFile pointers.
// NOTE(review): presumably filled in by the driver as inputs are loaded and
// not owning the files it points at -- confirm against the callers.
std::vector<InputFile *> macho::inputFiles;
|
|
|
|
|
|
|
|
// Open a given file path and return it as a memory-mapped file.
|
|
|
|
Optional<MemoryBufferRef> macho::readFile(StringRef path) {
|
|
|
|
// Open a file.
|
|
|
|
auto mbOrErr = MemoryBuffer::getFile(path);
|
|
|
|
if (auto ec = mbOrErr.getError()) {
|
|
|
|
error("cannot open " + path + ": " + ec.message());
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<MemoryBuffer> &mb = *mbOrErr;
|
|
|
|
MemoryBufferRef mbref = mb->getMemBufferRef();
|
|
|
|
make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership
|
2020-04-22 04:37:57 +08:00
|
|
|
|
|
|
|
// If this is a regular non-fat file, return it.
|
|
|
|
const char *buf = mbref.getBufferStart();
|
|
|
|
auto *hdr = reinterpret_cast<const MachO::fat_header *>(buf);
|
|
|
|
if (read32be(&hdr->magic) != MachO::FAT_MAGIC)
|
|
|
|
return mbref;
|
|
|
|
|
2020-04-30 06:42:36 +08:00
|
|
|
// Object files and archive files may be fat files, which contains
|
|
|
|
// multiple real files for different CPU ISAs. Here, we search for a
|
|
|
|
// file that matches with the current link target and returns it as
|
|
|
|
// a MemoryBufferRef.
|
|
|
|
auto *arch = reinterpret_cast<const MachO::fat_arch *>(buf + sizeof(*hdr));
|
|
|
|
|
|
|
|
for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) {
|
|
|
|
if (reinterpret_cast<const char *>(arch + i + 1) >
|
|
|
|
buf + mbref.getBufferSize()) {
|
|
|
|
error(path + ": fat_arch struct extends beyond end of file");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (read32be(&arch[i].cputype) != target->cpuType ||
|
|
|
|
read32be(&arch[i].cpusubtype) != target->cpuSubtype)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
uint32_t offset = read32be(&arch[i].offset);
|
|
|
|
uint32_t size = read32be(&arch[i].size);
|
|
|
|
if (offset + size > mbref.getBufferSize())
|
|
|
|
error(path + ": slice extends beyond end of file");
|
|
|
|
return MemoryBufferRef(StringRef(buf + offset, size), path.copy(bAlloc));
|
|
|
|
}
|
|
|
|
|
|
|
|
error("unable to find matching architecture in " + path);
|
2020-04-22 04:37:57 +08:00
|
|
|
return None;
|
2020-04-03 02:54:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Scans the load commands that immediately follow `hdr` and returns the first
// one whose `cmd` field equals `type`. Returns nullptr when none of the
// `hdr->ncmds` commands match. Each step advances by the command's own
// `cmdsize`; no bounds checking is done here.
static const load_command *findCommand(const mach_header_64 *hdr,
                                       uint32_t type) {
  // The first load command starts right after the header.
  const auto *cursor = reinterpret_cast<const uint8_t *>(hdr + 1);

  for (uint32_t remaining = hdr->ncmds; remaining != 0; --remaining) {
    const auto *lc = reinterpret_cast<const load_command *>(cursor);
    if (lc->cmd == type)
      return lc;
    cursor += lc->cmdsize;
  }
  return nullptr;
}
|
|
|
|
|
|
|
|
std::vector<InputSection *>
|
|
|
|
InputFile::parseSections(ArrayRef<section_64> sections) {
|
|
|
|
std::vector<InputSection *> ret;
|
|
|
|
ret.reserve(sections.size());
|
|
|
|
|
|
|
|
auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
|
|
|
|
|
|
|
|
for (const section_64 &sec : sections) {
|
|
|
|
InputSection *isec = make<InputSection>();
|
|
|
|
isec->file = this;
|
2020-04-28 03:59:20 +08:00
|
|
|
isec->header = &sec;
|
2020-04-03 02:54:05 +08:00
|
|
|
isec->name = StringRef(sec.sectname, strnlen(sec.sectname, 16));
|
|
|
|
isec->segname = StringRef(sec.segname, strnlen(sec.segname, 16));
|
|
|
|
isec->data = {buf + sec.offset, static_cast<size_t>(sec.size)};
|
|
|
|
if (sec.align >= 32)
|
|
|
|
error("alignment " + std::to_string(sec.align) + " of section " +
|
|
|
|
isec->name + " is too large");
|
|
|
|
else
|
|
|
|
isec->align = 1 << sec.align;
|
|
|
|
isec->flags = sec.flags;
|
|
|
|
ret.push_back(isec);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void InputFile::parseRelocations(const section_64 &sec,
|
|
|
|
std::vector<Reloc> &relocs) {
|
|
|
|
auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
|
|
|
|
ArrayRef<any_relocation_info> relInfos(
|
|
|
|
reinterpret_cast<const any_relocation_info *>(buf + sec.reloff),
|
|
|
|
sec.nreloc);
|
|
|
|
|
|
|
|
for (const any_relocation_info &anyRel : relInfos) {
|
|
|
|
Reloc r;
|
|
|
|
if (anyRel.r_word0 & R_SCATTERED) {
|
|
|
|
error("TODO: Scattered relocations not supported");
|
|
|
|
} else {
|
|
|
|
auto rel = reinterpret_cast<const relocation_info &>(anyRel);
|
|
|
|
r.type = rel.r_type;
|
|
|
|
r.offset = rel.r_address;
|
|
|
|
r.addend = target->getImplicitAddend(buf + sec.offset + r.offset, r.type);
|
2020-04-28 03:59:20 +08:00
|
|
|
if (rel.r_extern) {
|
2020-04-03 02:54:05 +08:00
|
|
|
r.target = symbols[rel.r_symbolnum];
|
2020-04-28 03:59:20 +08:00
|
|
|
} else {
|
|
|
|
if (rel.r_symbolnum == 0 || rel.r_symbolnum > sections.size())
|
|
|
|
fatal("invalid section index in relocation for offset " +
|
|
|
|
std::to_string(r.offset) + " in section " + sec.sectname +
|
|
|
|
" of " + getName());
|
|
|
|
r.target = sections[rel.r_symbolnum - 1];
|
2020-04-03 02:54:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
relocs.push_back(r);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parses a Mach-O object file held in `mb`.
//
// The constructor
//   1. builds `sections` from the sections of the first LC_SEGMENT_64 command,
//   2. builds `symbols` from the LC_SYMTAB command: symbols with n_sect == 0
//      are added as undefined, N_EXT symbols as global definitions in the
//      symbol table, and the rest as local Defined symbols,
//   3. parses each section's relocations.
//
// A symbol whose n_sect does not name one of the parsed sections is a fatal
// error.
ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) {
  auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
  auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
  ArrayRef<section_64> objSections;

  if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) {
    auto *c = reinterpret_cast<const segment_command_64 *>(cmd);
    // The section headers follow the segment command directly.
    objSections = ArrayRef<section_64>{
        reinterpret_cast<const section_64 *>(c + 1), c->nsects};
    sections = parseSections(objSections);
  }

  // TODO: Error on missing LC_SYMTAB?
  if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) {
    auto *c = reinterpret_cast<const symtab_command *>(cmd);
    const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff;
    ArrayRef<const nlist_64> nList(
        reinterpret_cast<const nlist_64 *>(buf + c->symoff), c->nsyms);

    symbols.reserve(c->nsyms);

    for (const nlist_64 &sym : nList) {
      StringRef name = strtab + sym.n_strx;

      // Undefined symbol
      if (!sym.n_sect) {
        symbols.push_back(symtab->addUndefined(name));
        continue;
      }

      // n_sect is a 1-based index into the sections parsed above. Reject
      // indices that do not name a section instead of reading out of bounds.
      if (sym.n_sect > sections.size())
        fatal("invalid section index " + std::to_string(sym.n_sect) +
              " for symbol " + name + " in " + getName());

      InputSection *isec = sections[sym.n_sect - 1];
      const section_64 &objSec = objSections[sym.n_sect - 1];
      // Store the symbol's value as an offset from the section's address.
      uint64_t value = sym.n_value - objSec.addr;

      // Global defined symbol
      if (sym.n_type & N_EXT) {
        symbols.push_back(symtab->addDefined(name, isec, value));
        continue;
      }

      // Local defined symbol
      symbols.push_back(make<Defined>(name, isec, value));
    }
  }

  // The relocations may refer to the symbols, so we parse them after we have
  // the symbols loaded.
  if (!sections.empty()) {
    auto it = sections.begin();
    for (const section_64 &sec : objSections) {
      parseRelocations(sec, (*it)->relocs);
      ++it;
    }
  }
}
|
|
|
|
|
2020-04-24 11:16:49 +08:00
|
|
|
// Parses the dylib held in `mb`.
//
// `umbrella` is the dylib that exported symbols are attributed to; when it is
// null, this file is its own umbrella. The constructor reads the install name
// from LC_ID_DYLIB, registers every name found in the export trie referenced
// by LC_DYLD_INFO_ONLY, and then, unless the header has
// MH_NO_REEXPORTED_DYLIBS set, loads each LC_REEXPORT_DYLIB dylib with the
// same umbrella. A missing load command or an unreadable re-exported dylib is
// reported as an error and stops the parse at that point.
DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella)
    : InputFile(DylibKind, mb) {
  if (!umbrella)
    umbrella = this;

  const char *fileStart = mb.getBufferStart();
  auto *data = reinterpret_cast<const uint8_t *>(fileStart);
  auto *header = reinterpret_cast<const mach_header_64 *>(fileStart);

  // Initialize dylibName.
  const load_command *idCmd = findCommand(header, LC_ID_DYLIB);
  if (!idCmd) {
    error("dylib " + getName() + " missing LC_ID_DYLIB load command");
    return;
  }
  auto *idDylib = reinterpret_cast<const dylib_command *>(idCmd);
  // dylib.name holds the offset of the name string from the command's start.
  dylibName =
      reinterpret_cast<const char *>(idCmd) + read32le(&idDylib->dylib.name);

  // Initialize symbols.
  const load_command *infoCmd = findCommand(header, LC_DYLD_INFO_ONLY);
  if (!infoCmd) {
    error("LC_DYLD_INFO_ONLY not found in " + getName());
    return;
  }
  auto *dyldInfo = reinterpret_cast<const dyld_info_command *>(infoCmd);
  parseTrie(data + dyldInfo->export_off, dyldInfo->export_size,
            [&](const Twine &exportName, uint64_t flags) {
              symbols.push_back(
                  symtab->addDylib(saver.save(exportName), umbrella));
            });

  if (header->flags & MH_NO_REEXPORTED_DYLIBS)
    return;

  // Walk every load command and load the dylibs this one re-exports.
  const auto *cursor = reinterpret_cast<const uint8_t *>(header + 1);
  for (uint32_t remaining = header->ncmds; remaining != 0; --remaining) {
    auto *lc = reinterpret_cast<const load_command *>(cursor);
    cursor += lc->cmdsize;
    if (lc->cmd != LC_REEXPORT_DYLIB)
      continue;

    auto *reexport = reinterpret_cast<const dylib_command *>(lc);
    StringRef reexportPath = reinterpret_cast<const char *>(reexport) +
                             read32le(&reexport->dylib.name);
    // TODO: Expand @loader_path, @executable_path etc in reexportPath
    Optional<MemoryBufferRef> buffer = readFile(reexportPath);
    if (!buffer) {
      error("unable to read re-exported dylib at " + reexportPath);
      return;
    }
    reexported.push_back(make<DylibFile>(*buffer, umbrella));
  }
}
|
|
|
|
|
[lld-macho] Support calls to functions in dylibs
Summary:
This diff implements lazy symbol binding -- very similar to the PLT
mechanism in ELF.
ELF's .plt section is broken up into two sections in Mach-O:
StubsSection and StubHelperSection. Calls to functions in dylibs will
end up calling into StubsSection, which contains indirect jumps to
addresses stored in the LazyPointerSection (the counterpart to ELF's
.plt.got).
Initially, the LazyPointerSection contains addresses that point into one
of the entry points in the middle of the StubHelperSection. The code in
StubHelperSection will push on the stack an offset into the
LazyBindingSection. The push is followed by a jump to the beginning of
the StubHelperSection (similar to PLT0), which then calls into
dyld_stub_binder. dyld_stub_binder is a non-lazily bound symbol, so this
call looks it up in the GOT.
The stub binder will look up the bind opcodes in the LazyBindingSection
at the given offset. The bind opcodes will tell the binder to update the
address in the LazyPointerSection to point to the symbol, so that
subsequent calls don't have to redo the symbol resolution. The binder
will then jump to the resolved symbol.
Depends on D78269.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Subscribers: llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D78270
2020-05-06 08:38:10 +08:00
|
|
|
// Constructs a dylib that is not backed by any input buffer: the InputFile
// base is given a default-constructed MemoryBufferRef.
DylibFile::DylibFile()
    : InputFile(DylibKind, MemoryBufferRef()) {
}
|
|
|
|
|
|
|
|
// Builds a stand-in DylibFile for libSystem without reading any file. The mock
// uses /usr/lib/libSystem.B.dylib as both its buffer identifier and its dylib
// name, has empty contents, and provides a single symbol, dyld_stub_binder.
DylibFile *DylibFile::createLibSystemMock() {
  const char *const installName = "/usr/lib/libSystem.B.dylib";

  DylibFile *mock = make<DylibFile>();
  mock->mb = MemoryBufferRef("", installName);
  mock->dylibName = installName;
  mock->symbols.push_back(symtab->addDylib("dyld_stub_binder", mock));
  return mock;
}
|
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
// Returns "<internal>" or "baz.o".
|
|
|
|
std::string lld::toString(const InputFile *file) {
|
|
|
|
return file ? std::string(file->getName()) : "<internal>";
|
|
|
|
}
|