2020-04-03 02:54:05 +08:00
|
|
|
//===- InputFiles.h ---------------------------------------------*- C++ -*-===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#ifndef LLD_MACHO_INPUT_FILES_H
|
|
|
|
#define LLD_MACHO_INPUT_FILES_H
|
|
|
|
|
2020-05-22 06:26:35 +08:00
|
|
|
#include "MachOStructs.h"
|
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "lld/Common/LLVM.h"
|
2020-08-19 06:46:21 +08:00
|
|
|
#include "lld/Common/Memory.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "llvm/ADT/DenseSet.h"
|
2020-12-15 06:59:22 +08:00
|
|
|
#include "llvm/ADT/SetVector.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "llvm/BinaryFormat/MachO.h"
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one for the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sections as a proxy,
but that is only accurate if `.subsections_via_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_via_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
|
2020-04-03 02:54:05 +08:00
|
|
|
#include "llvm/Object/Archive.h"
|
|
|
|
#include "llvm/Support/MemoryBuffer.h"
|
2020-06-06 02:18:33 +08:00
|
|
|
#include "llvm/TextAPI/MachO/TextAPIReader.h"
|
[lld-macho][re-land] Support .subsections_via_symbols
Summary:
This diff restores and builds upon @pcc and @ruiu's initial work on
subsections.
The .subsections_via_symbols directive indicates we can split each
section along symbol boundaries, unless those symbols have been marked
with `.alt_entry`.
We exercise this functionality in our tests by using order files that
rearrange those symbols.
Depends on D79668.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Reviewed By: smeenai
Subscribers: thakis, llvm-commits, pcc, ruiu
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D79926
2020-05-19 23:46:07 +08:00
|
|
|
|
|
|
|
#include <map>
|
2020-04-03 02:54:05 +08:00
|
|
|
#include <vector>
|
|
|
|
|
2020-10-27 10:18:29 +08:00
|
|
|
// Forward declarations of the LLVM types that the declarations in this header
// name without needing their full definitions.
namespace llvm {
namespace lto {
class InputFile;
} // namespace lto
namespace MachO {
class InterfaceFile;
} // namespace MachO
class TarWriter;
} // namespace llvm
|
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
namespace lld {
|
|
|
|
namespace macho {
|
|
|
|
|
|
|
|
// Forward declarations of linker types used by the declarations below.
class InputSection;
class Symbol;
struct Reloc;
// Opaque declaration; the enumerators are defined elsewhere.
enum class RefState : uint8_t;
|
2020-04-03 02:54:05 +08:00
|
|
|
|
2020-11-29 11:38:27 +08:00
|
|
|
// If --reproduce option is given, all input files are written
|
|
|
|
// to this tar archive.
|
|
|
|
extern std::unique_ptr<llvm::TarWriter> tar;
|
|
|
|
|
[lld-macho][re-land] Support .subsections_via_symbols
Summary:
This diff restores and builds upon @pcc and @ruiu's initial work on
subsections.
The .subsections_via_symbols directive indicates we can split each
section along symbol boundaries, unless those symbols have been marked
with `.alt_entry`.
We exercise this functionality in our tests by using order files that
rearrange those symbols.
Depends on D79668.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Reviewed By: smeenai
Subscribers: thakis, llvm-commits, pcc, ruiu
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D79926
2020-05-19 23:46:07 +08:00
|
|
|
// If .subsections_via_symbols is set, each InputSection will be split along
|
|
|
|
// symbol boundaries. The keys of a SubsectionMap represent the offsets of
|
|
|
|
// each subsection from the start of the original pre-split InputSection.
|
|
|
|
using SubsectionMap = std::map<uint32_t, InputSection *>;
|
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
class InputFile {
|
|
|
|
public:
|
|
|
|
enum Kind {
|
|
|
|
ObjKind,
|
2020-10-27 10:18:29 +08:00
|
|
|
OpaqueKind,
|
2020-04-22 04:37:57 +08:00
|
|
|
DylibKind,
|
2020-05-15 03:43:51 +08:00
|
|
|
ArchiveKind,
|
2020-10-27 10:18:29 +08:00
|
|
|
BitcodeKind,
|
2020-04-03 02:54:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
virtual ~InputFile() = default;
|
|
|
|
Kind kind() const { return fileKind; }
|
2020-08-19 06:46:21 +08:00
|
|
|
StringRef getName() const { return name; }
|
2020-04-03 02:54:05 +08:00
|
|
|
|
|
|
|
MemoryBufferRef mb;
|
2020-12-02 08:00:48 +08:00
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
std::vector<Symbol *> symbols;
|
[lld-macho][re-land] Support .subsections_via_symbols
Summary:
This diff restores and builds upon @pcc and @ruiu's initial work on
subsections.
The .subsections_via_symbols directive indicates we can split each
section along symbol boundaries, unless those symbols have been marked
with `.alt_entry`.
We exercise this functionality in our tests by using order files that
rearrange those symbols.
Depends on D79668.
Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee
Reviewed By: smeenai
Subscribers: thakis, llvm-commits, pcc, ruiu
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D79926
2020-05-19 23:46:07 +08:00
|
|
|
std::vector<SubsectionMap> subsections;
|
2020-12-02 06:45:12 +08:00
|
|
|
// Provides an easy way to sort InputFiles deterministically.
|
|
|
|
const int id;
|
2020-04-03 02:54:05 +08:00
|
|
|
|
2020-12-02 08:00:48 +08:00
|
|
|
// If not empty, this stores the name of the archive containing this file.
|
|
|
|
// We use this string for creating error messages.
|
|
|
|
std::string archiveName;
|
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
protected:
|
2020-08-19 06:46:21 +08:00
|
|
|
InputFile(Kind kind, MemoryBufferRef mb)
|
2020-12-02 06:45:12 +08:00
|
|
|
: mb(mb), id(idCount++), fileKind(kind), name(mb.getBufferIdentifier()) {}
|
2020-08-19 06:46:21 +08:00
|
|
|
|
2021-03-12 02:28:08 +08:00
|
|
|
InputFile(Kind, const llvm::MachO::InterfaceFile &);
|
2020-04-03 02:54:05 +08:00
|
|
|
|
|
|
|
private:
|
|
|
|
const Kind fileKind;
|
2020-08-19 06:46:21 +08:00
|
|
|
const StringRef name;
|
2020-12-02 06:45:12 +08:00
|
|
|
|
|
|
|
static int idCount;
|
2020-04-03 02:54:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
// .o file
|
|
|
|
class ObjFile : public InputFile {
|
|
|
|
public:
|
2020-12-02 11:57:37 +08:00
|
|
|
ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName);
|
2020-04-03 02:54:05 +08:00
|
|
|
static bool classof(const InputFile *f) { return f->kind() == ObjKind; }
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
|
|
|
|
llvm::DWARFUnit *compileUnit = nullptr;
|
2020-12-02 06:45:11 +08:00
|
|
|
const uint32_t modTime;
|
2020-12-02 11:57:37 +08:00
|
|
|
ArrayRef<llvm::MachO::section_64> sectionHeaders;
|
2020-12-09 09:47:19 +08:00
|
|
|
std::vector<InputSection *> debugSections;
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
|
|
|
|
private:
|
2020-12-02 11:57:37 +08:00
|
|
|
void parseSections(ArrayRef<llvm::MachO::section_64>);
|
|
|
|
void parseSymbols(ArrayRef<lld::structs::nlist_64> nList, const char *strtab,
|
|
|
|
bool subsectionsViaSymbols);
|
|
|
|
Symbol *parseNonSectionSymbol(const structs::nlist_64 &sym, StringRef name);
|
|
|
|
void parseRelocations(const llvm::MachO::section_64 &, SubsectionMap &);
|
[lld-macho] Emit STABS symbols for debugging, and drop debug sections
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.
With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.
Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one of the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.
Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sectioins as a proxy,
but that is only accurate if `.subsections_with_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:
1. We can split up subsections by symbol even if `.subsections_with_symbols`
is not set, but include constraints to ensure those subsections retain
their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
and I'm more inclined toward it, but I'm not sure if there are use cases
that it doesn't handle well. As such I'm punting on the decision for now.
Reviewed By: clayborg
Differential Revision: https://reviews.llvm.org/D89257
2020-12-02 06:45:01 +08:00
|
|
|
void parseDebugInfo();
|
2020-04-03 02:54:05 +08:00
|
|
|
};
|
|
|
|
|
2020-08-11 09:47:13 +08:00
|
|
|
// command-line -sectcreate file
|
|
|
|
class OpaqueFile : public InputFile {
|
|
|
|
public:
|
2020-12-02 11:57:37 +08:00
|
|
|
OpaqueFile(MemoryBufferRef mb, StringRef segName, StringRef sectName);
|
2020-08-11 09:47:13 +08:00
|
|
|
static bool classof(const InputFile *f) { return f->kind() == OpaqueKind; }
|
|
|
|
};
|
|
|
|
|
2020-04-22 04:37:57 +08:00
|
|
|
// .dylib file
|
|
|
|
class DylibFile : public InputFile {
|
|
|
|
public:
|
2020-04-24 11:16:49 +08:00
|
|
|
// Mach-O dylibs can re-export other dylibs as sub-libraries, meaning that the
|
|
|
|
// symbols in those sub-libraries will be available under the umbrella
|
|
|
|
// library's namespace. Those sub-libraries can also have their own
|
|
|
|
// re-exports. When loading a re-exported dylib, `umbrella` should be set to
|
|
|
|
// the root dylib to ensure symbols in the child library are correctly bound
|
|
|
|
// to the root. On the other hand, if a dylib is being directly loaded
|
|
|
|
// (through an -lfoo flag), then `umbrella` should be a nullptr.
|
2021-02-23 02:03:02 +08:00
|
|
|
explicit DylibFile(MemoryBufferRef mb, DylibFile *umbrella = nullptr,
|
|
|
|
bool isBundleLoader = false);
|
2020-06-06 02:18:33 +08:00
|
|
|
|
2020-08-19 06:46:21 +08:00
|
|
|
explicit DylibFile(const llvm::MachO::InterfaceFile &interface,
|
2021-02-23 02:03:02 +08:00
|
|
|
DylibFile *umbrella = nullptr,
|
|
|
|
bool isBundleLoader = false);
|
2020-08-19 06:46:21 +08:00
|
|
|
|
2020-04-22 04:37:57 +08:00
|
|
|
static bool classof(const InputFile *f) { return f->kind() == DylibKind; }
|
|
|
|
|
|
|
|
StringRef dylibName;
|
2020-12-16 04:25:15 +08:00
|
|
|
uint32_t compatibilityVersion = 0;
|
|
|
|
uint32_t currentVersion = 0;
|
2021-02-23 02:03:02 +08:00
|
|
|
int64_t ordinal = 0; // Ordinal numbering starts from 1, so 0 is a sentinel
|
2020-12-17 08:14:57 +08:00
|
|
|
RefState refState;
|
2020-04-24 11:16:49 +08:00
|
|
|
bool reexport = false;
|
2020-09-19 02:38:15 +08:00
|
|
|
bool forceWeakImport = false;
|
2021-02-23 02:03:02 +08:00
|
|
|
|
|
|
|
// An executable can be used as a bundle loader that will load the output
|
|
|
|
// file being linked, and that contains symbols referenced, but not
|
|
|
|
// implemented in the bundle. When used like this, it is very similar
|
|
|
|
// to a Dylib, so we re-used the same class to represent it.
|
|
|
|
bool isBundleLoader;
|
2020-04-22 04:37:57 +08:00
|
|
|
};
|
|
|
|
|
2020-05-15 03:43:51 +08:00
|
|
|
// .a file
|
|
|
|
class ArchiveFile : public InputFile {
|
|
|
|
public:
|
|
|
|
explicit ArchiveFile(std::unique_ptr<llvm::object::Archive> &&file);
|
|
|
|
static bool classof(const InputFile *f) { return f->kind() == ArchiveKind; }
|
|
|
|
void fetch(const llvm::object::Archive::Symbol &sym);
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<llvm::object::Archive> file;
|
|
|
|
// Keep track of children fetched from the archive by tracking
|
|
|
|
// which address offsets have been fetched already.
|
|
|
|
llvm::DenseSet<uint64_t> seen;
|
|
|
|
};
|
|
|
|
|
2020-10-27 10:18:29 +08:00
|
|
|
class BitcodeFile : public InputFile {
|
|
|
|
public:
|
|
|
|
explicit BitcodeFile(MemoryBufferRef mb);
|
|
|
|
static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; }
|
|
|
|
|
|
|
|
std::unique_ptr<llvm::lto::InputFile> obj;
|
|
|
|
};
|
|
|
|
|
2020-12-15 06:59:22 +08:00
|
|
|
// Global collection of InputFile pointers; declared here, defined elsewhere.
extern llvm::SetVector<InputFile *> inputFiles;
|
2020-04-03 02:54:05 +08:00
|
|
|
|
2021-03-02 17:20:22 +08:00
|
|
|
// Reads the file at `path`. NOTE(review): presumably yields an empty Optional
// when the file cannot be read -- confirm against the definition.
llvm::Optional<MemoryBufferRef> readFile(StringRef path);
|
2020-04-03 02:54:05 +08:00
|
|
|
|
2021-03-05 05:58:21 +08:00
|
|
|
template <class CommandType = llvm::MachO::load_command>
|
|
|
|
const CommandType *findCommand(const llvm::MachO::mach_header_64 *hdr,
|
|
|
|
uint32_t type) {
|
|
|
|
const uint8_t *p = reinterpret_cast<const uint8_t *>(hdr) +
|
|
|
|
sizeof(llvm::MachO::mach_header_64);
|
|
|
|
|
|
|
|
for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
|
|
|
|
auto *cmd = reinterpret_cast<const CommandType *>(p);
|
|
|
|
if (cmd->cmd == type)
|
|
|
|
return cmd;
|
|
|
|
p += cmd->cmdsize;
|
|
|
|
}
|
|
|
|
return nullptr;
|
|
|
|
}
|
2020-08-19 05:37:04 +08:00
|
|
|
|
2020-04-03 02:54:05 +08:00
|
|
|
} // namespace macho
|
|
|
|
|
|
|
|
// Returns a string describing `file`. NOTE(review): presumably used when
// printing diagnostics; the exact format is set by the out-of-line definition.
std::string toString(const macho::InputFile *file);
|
|
|
|
} // namespace lld
|
|
|
|
|
|
|
|
#endif
|