//===- InputSection.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "InputSection.h"
#include "Config.h"
#include "InputFiles.h"
#include "OutputSections.h"
#include "Relocations.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
#include "lld/Common/CommonLinkerContext.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <mutex>
#include <vector>

using namespace llvm;
using namespace llvm::ELF;
using namespace llvm::object;
using namespace llvm::support;
using namespace llvm::support::endian;
using namespace llvm::sys;
using namespace lld;
using namespace lld::elf;

SmallVector<InputSectionBase *, 0> elf::inputSections;
DenseSet<std::pair<const Symbol *, uint64_t>> elf::ppc64noTocRelax;

// Returns a string to construct an error message.
std::string lld::toString(const InputSectionBase *sec) {
  return (toString(sec->file) + ":(" + sec->name + ")").str();
}

template <class ELFT>
static ArrayRef<uint8_t> getSectionContents(ObjFile<ELFT> &file,
                                            const typename ELFT::Shdr &hdr) {
  if (hdr.sh_type == SHT_NOBITS)
    return makeArrayRef<uint8_t>(nullptr, hdr.sh_size);
  return check(file.getObj().getSectionContents(hdr));
}

InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags,
                                   uint32_t type, uint64_t entsize,
                                   uint32_t link, uint32_t info,
                                   uint32_t alignment, ArrayRef<uint8_t> data,
                                   StringRef name, Kind sectionKind)
    : SectionBase(sectionKind, name, flags, entsize, alignment, type, info,
                  link),
      file(file), rawData(data) {
  // In order to reduce memory allocation, we assume that mergeable
  // sections are smaller than 4 GiB, which is not an unreasonable
  // assumption as of 2017.
  if (sectionKind == SectionBase::Merge && rawData.size() > UINT32_MAX)
    error(toString(this) + ": section too large");

  // The ELF spec states that a value of 0 means the section has
  // no alignment constraints.
  uint32_t v = std::max<uint32_t>(alignment, 1);
  if (!isPowerOf2_64(v))
    fatal(toString(this) + ": sh_addralign is not a power of 2");
  this->alignment = v;

  // In ELF, each section can be compressed by zlib, and if compressed,
  // section name may be mangled by appending "z" (e.g. ".zdebug_info").
  // If that's the case, demangle section name so that we can handle a
  // section as if it weren't compressed.
  if ((flags & SHF_COMPRESSED) || name.startswith(".zdebug")) {
    if (!zlib::isAvailable())
      error(toString(file) + ": contains a compressed section, " +
            "but zlib is not available");
    invokeELFT(parseCompressedHeader);
  }
}

// Drop SHF_GROUP bit unless we are producing a re-linkable object file.
// SHF_GROUP is a marker that a section belongs to some comdat group.
// That flag doesn't make sense in an executable.
static uint64_t getFlags(uint64_t flags) {
  flags &= ~(uint64_t)SHF_INFO_LINK;
  if (!config->relocatable)
    flags &= ~(uint64_t)SHF_GROUP;
  return flags;
}

template <class ELFT>
InputSectionBase::InputSectionBase(ObjFile<ELFT> &file,
                                   const typename ELFT::Shdr &hdr,
                                   StringRef name, Kind sectionKind)
    : InputSectionBase(&file, getFlags(hdr.sh_flags), hdr.sh_type,
                       hdr.sh_entsize, hdr.sh_link, hdr.sh_info,
                       hdr.sh_addralign, getSectionContents(file, hdr), name,
                       sectionKind) {
  // We reject object files having insanely large alignments even though
  // they are allowed by the spec. I think 4GB is a reasonable limitation.
  // We might want to relax this in the future.
  if (hdr.sh_addralign > UINT32_MAX)
    fatal(toString(&file) + ": section sh_addralign is too large");
}

size_t InputSectionBase::getSize() const {
  if (auto *s = dyn_cast<SyntheticSection>(this))
    return s->getSize();
  if (uncompressedSize >= 0)
    return uncompressedSize;
  return rawData.size() - bytesDropped;
}
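
// Uncompress the zlib-compressed payload into a buffer taken from the bump
// allocator (the allocation is guarded by a mutex because this may run from
// multiple threads) and point rawData at the uncompressed bytes.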
void InputSectionBase::uncompress() const {
  size_t size = uncompressedSize;
  char *uncompressedBuf;
  {
    static std::mutex mu;
    std::lock_guard<std::mutex> lock(mu);
    uncompressedBuf = bAlloc().Allocate<char>(size);
  }

  if (Error e = zlib::uncompress(toStringRef(rawData), uncompressedBuf, size))
    fatal(toString(this) +
          ": uncompress failed: " + llvm::toString(std::move(e)));
  rawData = makeArrayRef((uint8_t *)uncompressedBuf, size);
  uncompressedSize = -1;
}
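
// Returns the relocation entries (SHT_REL or SHT_RELA) that apply to this
// section, as a typed array view; at most one of rels/relas is non-empty.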
template <class ELFT> RelsOrRelas<ELFT> InputSectionBase::relsOrRelas() const {
  if (relSecIdx == 0)
    return {};
  RelsOrRelas<ELFT> ret;
  typename ELFT::Shdr shdr =
      cast<ELFFileBase>(file)->getELFShdrs<ELFT>()[relSecIdx];
  if (shdr.sh_type == SHT_REL) {
    ret.rels = makeArrayRef(reinterpret_cast<const typename ELFT::Rel *>(
                                file->mb.getBufferStart() + shdr.sh_offset),
                            shdr.sh_size / sizeof(typename ELFT::Rel));
  } else {
    assert(shdr.sh_type == SHT_RELA);
    ret.relas = makeArrayRef(reinterpret_cast<const typename ELFT::Rela *>(
                                 file->mb.getBufferStart() + shdr.sh_offset),
                             shdr.sh_size / sizeof(typename ELFT::Rela));
  }
  return ret;
}

uint64_t SectionBase::getOffset(uint64_t offset) const {
  switch (kind()) {
  case Output: {
    auto *os = cast<OutputSection>(this);
    // For output sections we treat offset -1 as the end of the section.
    return offset == uint64_t(-1) ? os->size : offset;
  }
  case Regular:
  case Synthetic:
    return cast<InputSection>(this)->outSecOff + offset;
  case EHFrame:
    // The file crtbeginT.o has relocations pointing to the start of an empty
    // .eh_frame that is known to be the first in the link. It does that to
    // identify the start of the output .eh_frame.
    return offset;
  case Merge:
    const MergeInputSection *ms = cast<MergeInputSection>(this);
    if (InputSection *isec = ms->getParent())
      return isec->outSecOff + ms->getParentOffset(offset);
    return ms->getParentOffset(offset);
  }
  llvm_unreachable("invalid section kind");
}
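
// Returns the output virtual address of `offset` within this section; if the
// section has not been assigned to an output section, the base address is
// treated as 0.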
uint64_t SectionBase::getVA(uint64_t offset) const {
  const OutputSection *out = getOutputSection();
  return (out ? out->addr : 0) + getOffset(offset);
}

OutputSection *SectionBase::getOutputSection() {
  InputSection *sec;
  if (auto *isec = dyn_cast<InputSection>(this))
    sec = isec;
  else if (auto *ms = dyn_cast<MergeInputSection>(this))
    sec = ms->getParent();
  else if (auto *eh = dyn_cast<EhInputSection>(this))
    sec = eh->getParent();
  else
    return cast<OutputSection>(this);
  return sec ? sec->getParent() : nullptr;
}

// When a section is compressed, `rawData` consists of a header followed
// by zlib-compressed data. This function parses the header to initialize
// the `uncompressedSize` member and removes the header from `rawData`.
template <typename ELFT> void InputSectionBase::parseCompressedHeader() {
  // Old-style header
  if (!(flags & SHF_COMPRESSED)) {
    assert(name.startswith(".zdebug"));
    if (!toStringRef(rawData).startswith("ZLIB")) {
      error(toString(this) + ": corrupted compressed section header");
      return;
    }
    rawData = rawData.slice(4);

    if (rawData.size() < 8) {
      error(toString(this) + ": corrupted compressed section header");
      return;
    }

    uncompressedSize = read64be(rawData.data());
    rawData = rawData.slice(8);

    // Restore the original section name.
    // (e.g. ".zdebug_info" -> ".debug_info")
    name = saver().save("." + name.substr(2));
    return;
  }

  flags &= ~(uint64_t)SHF_COMPRESSED;

  // New-style header
  if (rawData.size() < sizeof(typename ELFT::Chdr)) {
    error(toString(this) + ": corrupted compressed section");
    return;
  }

  auto *hdr = reinterpret_cast<const typename ELFT::Chdr *>(rawData.data());
  if (hdr->ch_type != ELFCOMPRESS_ZLIB) {
    error(toString(this) + ": unsupported compression type");
    return;
  }

  uncompressedSize = hdr->ch_size;
  alignment = std::max<uint32_t>(hdr->ch_addralign, 1);
  rawData = rawData.slice(sizeof(*hdr));
}

InputSection *InputSectionBase::getLinkOrderDep() const {
  assert(flags & SHF_LINK_ORDER);
  if (!link)
    return nullptr;
  return cast<InputSection>(file->getSections()[link]);
}

// Find a function symbol that encloses a given location.
Defined *InputSectionBase::getEnclosingFunction(uint64_t offset) {
  for (Symbol *b : file->getSymbols())
    if (Defined *d = dyn_cast<Defined>(b))
      if (d->section == this && d->type == STT_FUNC && d->value <= offset &&
          offset < d->value + d->size)
        return d;
  return nullptr;
}

// Returns an object file location string. Used to construct an error message.
std::string InputSectionBase::getLocation(uint64_t offset) {
  std::string secAndOffset =
      (name + "+0x" + Twine::utohexstr(offset) + ")").str();

  // We don't have a file for synthetic sections.
  if (file == nullptr)
    return (config->outputFile + ":(" + secAndOffset).str();

  std::string filename = toString(file);
  if (Defined *d = getEnclosingFunction(offset))
    return filename + ":(function " + toString(*d) + ": " + secAndOffset;

  return filename + ":(" + secAndOffset;
}

// This function is intended to be used for constructing an error message.
// The returned message looks like this:
//
//   foo.c:42 (/home/alice/possibly/very/long/path/foo.c:42)
//
// Returns an empty string if there's no way to get line info.
std::string InputSectionBase::getSrcMsg(const Symbol &sym, uint64_t offset) {
  return file->getSrcMsg(sym, *this, offset);
}

// Returns a filename string along with an optional section name. This
// function is intended to be used for constructing an error
// message. The returned message looks like this:
//
//   path/to/foo.o:(function bar)
//
// or
//
//   path/to/foo.o:(function bar) in archive path/to/bar.a
std::string InputSectionBase::getObjMsg(uint64_t off) {
  std::string filename = std::string(file->getName());

  std::string archive;
  if (!file->archiveName.empty())
    archive = (" in archive " + file->archiveName).str();

  // Find a symbol that encloses a given location.
  for (Symbol *b : file->getSymbols())
    if (auto *d = dyn_cast<Defined>(b))
      if (d->section == this && d->value <= off && off < d->value + d->size)
        return filename + ":(" + toString(*d) + ")" + archive;

  // If there's no symbol, print out the offset in the section.
  return (filename + ":(" + name + "+0x" + utohexstr(off) + ")" + archive)
      .str();
}

InputSection InputSection::discarded(nullptr, 0, 0, 0, ArrayRef<uint8_t>(), "");

InputSection::InputSection(InputFile *f, uint64_t flags, uint32_t type,
                           uint32_t alignment, ArrayRef<uint8_t> data,
                           StringRef name, Kind k)
    : InputSectionBase(f, flags, type,
                       /*Entsize*/ 0, /*Link*/ 0, /*Info*/ 0, alignment, data,
                       name, k) {}

template <class ELFT>
InputSection::InputSection(ObjFile<ELFT> &f, const typename ELFT::Shdr &header,
                           StringRef name)
    : InputSectionBase(f, header, name, InputSectionBase::Regular) {}

bool InputSection::classof(const SectionBase *s) {
  return s->kind() == SectionBase::Regular ||
         s->kind() == SectionBase::Synthetic;
}

OutputSection *InputSection::getParent() const {
  return cast_or_null<OutputSection>(parent);
}

// Copy SHT_GROUP section contents. Used only for the -r option.
template <class ELFT> void InputSection::copyShtGroup(uint8_t *buf) {
  // ELFT::Word is the 32-bit integral type in the target endianness.
  using u32 = typename ELFT::Word;
  ArrayRef<u32> from = getDataAs<u32>();
  auto *to = reinterpret_cast<u32 *>(buf);

  // The first entry is not a section number but a flag.
  *to++ = from[0];

  // Adjust section numbers because section numbers in an input object file are
  // different in the output. We also need to handle combined or discarded
  // members.
  ArrayRef<InputSectionBase *> sections = file->getSections();
  DenseSet<uint32_t> seen;
  for (uint32_t idx : from.slice(1)) {
    OutputSection *osec = sections[idx]->getOutputSection();
    if (osec && seen.insert(osec->sectionIndex).second)
      *to++ = osec->sectionIndex;
  }
}
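
// For a relocation section (SHT_REL or SHT_RELA), returns the section that
// the relocations apply to (the section named by sh_info); returns nullptr
// otherwise.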
InputSectionBase *InputSection::getRelocatedSection() const {
  if (!file || (type != SHT_RELA && type != SHT_REL))
    return nullptr;
  ArrayRef<InputSectionBase *> sections = file->getSections();
  return sections[info];
}

// This is used for -r and --emit-relocs. We can't use memcpy to copy
// relocations because we need to update symbol table offset and section index
// for each relocation. So we copy relocations one by one.
template <class ELFT, class RelTy>
void InputSection::copyRelocations(uint8_t *buf, ArrayRef<RelTy> rels) {
  const TargetInfo &target = *elf::target;
  InputSectionBase *sec = getRelocatedSection();

  for (const RelTy &rel : rels) {
    RelType type = rel.getType(config->isMips64EL);
    const ObjFile<ELFT> *file = getFile<ELFT>();
    Symbol &sym = file->getRelocTargetSym(rel);

    auto *p = reinterpret_cast<typename ELFT::Rela *>(buf);
    buf += sizeof(RelTy);

    if (RelTy::IsRela)
      p->r_addend = getAddend<ELFT>(rel);

    // Output section VA is zero for -r, so r_offset is an offset within the
    // section, but for --emit-relocs it is a virtual address.
    p->r_offset = sec->getVA(rel.r_offset);
    p->setSymbolAndType(in.symTab->getSymbolIndex(&sym), type,
                        config->isMips64EL);

    if (sym.type == STT_SECTION) {
      // We combine multiple section symbols into only one per
      // section. This means we have to update the addend. That is
      // trivial for Elf_Rela, but for Elf_Rel we have to write to the
      // section data. We do that by adding to the Relocation vector.

      // .eh_frame is horribly special and can reference discarded sections. To
      // avoid having to parse and recreate .eh_frame, we just replace any
      // relocation in it pointing to discarded sections with R_*_NONE, which
      // hopefully creates a frame that is ignored at runtime. Also, don't warn
      // on .gcc_except_table and debug sections.
      //
      // See the comment in maybeReportUndefined for PPC32 .got2 and PPC64 .toc
      auto *d = dyn_cast<Defined>(&sym);
      if (!d) {
        if (!isDebugSection(*sec) && sec->name != ".eh_frame" &&
            sec->name != ".gcc_except_table" && sec->name != ".got2" &&
            sec->name != ".toc") {
          uint32_t secIdx = cast<Undefined>(sym).discardedSecIdx;
          Elf_Shdr_Impl<ELFT> sec = file->template getELFShdrs<ELFT>()[secIdx];
          warn("relocation refers to a discarded section: " +
               CHECK(file->getObj().getSectionName(sec), file) +
               "\n>>> referenced by " + getObjMsg(p->r_offset));
        }
        p->setSymbolAndType(0, 0, false);
        continue;
      }
      SectionBase *section = d->section;
      if (!section->isLive()) {
        p->setSymbolAndType(0, 0, false);
        continue;
      }

      int64_t addend = getAddend<ELFT>(rel);
      const uint8_t *bufLoc = sec->data().begin() + rel.r_offset;
      if (!RelTy::IsRela)
        addend = target.getImplicitAddend(bufLoc, type);

      if (config->emachine == EM_MIPS &&
          target.getRelExpr(type, sym, bufLoc) == R_MIPS_GOTREL) {
        // Some MIPS relocations depend on "gp" value. By default,
        // this value has 0x7ff0 offset from a .got section. But
        // relocatable files produced by a compiler or a linker
        // might redefine this default value and we must use it
        // for a calculation of the relocation result. When we
        // generate EXE or DSO it's trivial. Generating a relocatable
        // output is more difficult case because the linker does
        // not calculate relocations in this mode and loses
        // individual "gp" values used by each input object file.
        // As a workaround we add the "gp" value to the relocation
        // addend and save it back to the file.
        addend += sec->getFile<ELFT>()->mipsGp0;
      }

      if (RelTy::IsRela)
        p->r_addend = sym.getVA(addend) - section->getOutputSection()->addr;
      else if (config->relocatable && type != target.noneRel)
        sec->relocations.push_back({R_ABS, type, rel.r_offset, addend, &sym});
    } else if (config->emachine == EM_PPC && type == R_PPC_PLTREL24 &&
               p->r_addend >= 0x8000 && sec->file->ppc32Got2) {
      // Similar to R_MIPS_GPREL{16,32}. If the addend of R_PPC_PLTREL24
      // indicates that r30 is relative to the input section .got2
      // (r_addend>=0x8000), after linking, r30 should be relative to the output
      // section .got2 . To compensate for the shift, adjust r_addend by
      // ppc32Got2->outSecOff.
      p->r_addend += sec->file->ppc32Got2->outSecOff;
    }
  }
}
|
|
|
|
|
2017-06-19 17:43:43 +08:00
|
|
|
// The ARM and AArch64 ABI handle pc-relative relocations to undefined weak
|
|
|
|
// references specially. The general rule is that the value of the symbol in
|
|
|
|
// this context is the address of the place P. A further special case is that
|
|
|
|
// branch relocations to an undefined weak reference resolve to the next
|
|
|
|
// instruction.
|
2017-10-12 06:49:24 +08:00
|
|
|
static uint32_t getARMUndefinedRelativeWeakVA(RelType type, uint32_t a,
|
2016-11-09 18:22:29 +08:00
|
|
|
uint32_t p) {
|
|
|
|
switch (type) {
|
2017-06-19 17:43:43 +08:00
|
|
|
// Unresolved branch relocations to weak references resolve to next
|
|
|
|
// instruction, this will be either 2 or 4 bytes on from P.
|
2021-11-10 04:35:44 +08:00
|
|
|
case R_ARM_THM_JUMP8:
|
2016-11-09 18:22:29 +08:00
|
|
|
case R_ARM_THM_JUMP11:
|
2017-06-13 02:05:01 +08:00
|
|
|
return p + 2 + a;
|
2016-11-09 18:22:29 +08:00
|
|
|
case R_ARM_CALL:
|
|
|
|
case R_ARM_JUMP24:
|
|
|
|
case R_ARM_PC24:
|
|
|
|
case R_ARM_PLT32:
|
|
|
|
case R_ARM_PREL31:
|
|
|
|
case R_ARM_THM_JUMP19:
|
|
|
|
case R_ARM_THM_JUMP24:
|
2017-06-13 02:05:01 +08:00
|
|
|
return p + 4 + a;
|
2016-11-09 18:22:29 +08:00
|
|
|
case R_ARM_THM_CALL:
|
|
|
|
// We don't want an interworking BLX to ARM
|
2017-06-13 02:05:01 +08:00
|
|
|
return p + 5 + a;
|
2017-06-19 17:43:43 +08:00
|
|
|
// Unresolved non-branch pc-relative relocations
|
|
|
|
// R_ARM_TARGET2, which can be resolved relatively, is not present as it never
|
|
|
|
// targets a weak reference.
|
|
|
|
case R_ARM_MOVW_PREL_NC:
|
|
|
|
case R_ARM_MOVT_PREL:
|
|
|
|
case R_ARM_REL32:
|
2020-02-24 03:46:46 +08:00
|
|
|
case R_ARM_THM_ALU_PREL_11_0:
|
2017-06-19 17:43:43 +08:00
|
|
|
case R_ARM_THM_MOVW_PREL_NC:
|
|
|
|
case R_ARM_THM_MOVT_PREL:
|
2020-02-24 03:46:46 +08:00
|
|
|
case R_ARM_THM_PC12:
|
2017-06-13 02:05:01 +08:00
|
|
|
return p + a;
|
2020-02-24 03:46:46 +08:00
|
|
|
// p + a is unrepresentable as negative immediates can't be encoded.
|
|
|
|
case R_ARM_THM_PC8:
|
|
|
|
return p;
|
2016-11-09 18:22:29 +08:00
|
|
|
}
|
2017-06-19 17:43:43 +08:00
|
|
|
llvm_unreachable("ARM pc-relative relocation expected\n");
|
2016-11-09 18:22:29 +08:00
|
|
|
}
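A hedged usage sketch of the function above (place and addend made up): an unresolved Thumb B.N (R_ARM_THM_JUMP11) to an undefined weak symbol resolves to the next 2-byte instruction.
```
uint32_t p = 0x8000, a = 0;
uint32_t va = getARMUndefinedRelativeWeakVA(R_ARM_THM_JUMP11, a, p); // p + 2 + a == 0x8002
```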
|
|
|
|
|
2017-06-19 17:43:43 +08:00
|
|
|
// The comment above getARMUndefinedRelativeWeakVA applies to this function.
|
2021-06-11 04:30:16 +08:00
|
|
|
static uint64_t getAArch64UndefinedRelativeWeakVA(uint64_t type, uint64_t p) {
|
2016-11-09 18:22:29 +08:00
|
|
|
switch (type) {
|
2017-06-19 17:43:43 +08:00
|
|
|
// Unresolved branch relocations to weak references resolve to the next
|
|
|
|
// instruction; this is 4 bytes on from P.
|
2016-11-09 18:22:29 +08:00
|
|
|
case R_AARCH64_CALL26:
|
|
|
|
case R_AARCH64_CONDBR19:
|
|
|
|
case R_AARCH64_JUMP26:
|
|
|
|
case R_AARCH64_TSTBR14:
|
2021-06-11 04:30:16 +08:00
|
|
|
return p + 4;
|
2017-06-19 17:43:43 +08:00
|
|
|
// Unresolved non-branch pc-relative relocations
|
|
|
|
case R_AARCH64_PREL16:
|
|
|
|
case R_AARCH64_PREL32:
|
|
|
|
case R_AARCH64_PREL64:
|
|
|
|
case R_AARCH64_ADR_PREL_LO21:
|
2017-09-21 07:49:50 +08:00
|
|
|
case R_AARCH64_LD_PREL_LO19:
|
2020-06-24 07:10:07 +08:00
|
|
|
case R_AARCH64_PLT32:
|
2021-06-11 04:30:16 +08:00
|
|
|
return p;
|
2016-11-09 18:22:29 +08:00
|
|
|
}
|
2017-06-19 17:43:43 +08:00
|
|
|
llvm_unreachable("AArch64 pc-relative relocation expected\n");
|
2016-11-09 18:22:29 +08:00
|
|
|
}
|
|
|
|
|
2021-06-11 04:25:16 +08:00
|
|
|
static uint64_t getRISCVUndefinedRelativeWeakVA(uint64_t type, uint64_t p) {
|
|
|
|
switch (type) {
|
|
|
|
case R_RISCV_BRANCH:
|
|
|
|
case R_RISCV_JAL:
|
|
|
|
case R_RISCV_CALL:
|
|
|
|
case R_RISCV_CALL_PLT:
|
|
|
|
case R_RISCV_RVC_BRANCH:
|
|
|
|
case R_RISCV_RVC_JUMP:
|
|
|
|
return p;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
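Likewise for RISC-V, a sketch with a made-up place: an unresolved branch or jump to an undefined weak symbol resolves to the place itself, producing a self-branch rather than an out-of-range target.
```
uint64_t p = 0x10000;
uint64_t va = getRISCVUndefinedRelativeWeakVA(R_RISCV_JAL, p); // == p, a self-branch
```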
|
|
|
|
|
2017-05-18 17:12:21 +08:00
|
|
|
// ARM SBREL relocations are of the form S + A - B, where B is the static base.
|
|
|
|
// The ARM ABI defines the base to be the "addressing origin of the output segment
|
|
|
|
// defining the symbol S". We define the "addressing origin"/static base to be
|
2017-11-04 08:31:04 +08:00
|
|
|
// the base of the PT_LOAD segment containing the Sym.
|
2017-05-18 17:12:21 +08:00
|
|
|
// The procedure call standard only defines a Read Write Position Independent
|
|
|
|
// (RWPI) variant, so in practice we should expect the static base to be the base
|
|
|
|
// of the RW segment.
|
2017-11-04 08:31:04 +08:00
|
|
|
static uint64_t getARMStaticBase(const Symbol &sym) {
|
|
|
|
OutputSection *os = sym.getOutputSection();
|
2017-09-07 19:01:10 +08:00
|
|
|
if (!os || !os->ptLoad || !os->ptLoad->firstSec)
|
2017-11-04 08:31:04 +08:00
|
|
|
fatal("SBREL relocation to " + sym.getName() + " without static base");
|
2017-09-07 19:01:10 +08:00
|
|
|
return os->ptLoad->firstSec->addr;
|
2017-05-18 17:12:21 +08:00
|
|
|
}
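A hedged numeric sketch of the SBREL computation that uses this static base (addresses hypothetical):
```
// S + A - B with B = base of the RW PT_LOAD segment containing the symbol.
uint64_t staticBase = 0x30000;        // getARMStaticBase(sym), hypothetical
uint64_t symVA = 0x30040, addend = 0;
uint64_t value = symVA + addend - staticBase; // 0x40, written for an SBREL relocation
```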
|
|
|
|
|
2018-08-10 01:59:56 +08:00
|
|
|
// For R_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}), the symbol actually
|
|
|
|
// points to the corresponding R_RISCV_PCREL_HI20 relocation, and the target VA
|
|
|
|
// is calculated using PCREL_HI20's symbol.
|
|
|
|
//
|
|
|
|
// This function returns the R_RISCV_PCREL_HI20 relocation from
|
|
|
|
// R_RISCV_PCREL_LO12's symbol and addend.
|
2018-11-23 23:13:26 +08:00
|
|
|
static Relocation *getRISCVPCRelHi20(const Symbol *sym, uint64_t addend) {
|
2018-08-10 01:59:56 +08:00
|
|
|
const Defined *d = cast<Defined>(sym);
|
2019-07-03 23:38:59 +08:00
|
|
|
if (!d->section) {
|
|
|
|
error("R_RISCV_PCREL_LO12 relocation points to an absolute symbol: " +
|
|
|
|
sym->getName());
|
|
|
|
return nullptr;
|
|
|
|
}
|
2018-08-10 01:59:56 +08:00
|
|
|
InputSection *isec = cast<InputSection>(d->section);
|
|
|
|
|
|
|
|
if (addend != 0)
|
2022-01-16 02:46:25 +08:00
|
|
|
warn("non-zero addend in R_RISCV_PCREL_LO12 relocation to " +
|
2018-08-10 01:59:56 +08:00
|
|
|
isec->getObjMsg(d->value) + " is ignored");
|
|
|
|
|
|
|
|
// Relocations are sorted by offset, so we can use std::equal_range to do
|
|
|
|
// binary search.
|
2019-02-15 03:21:10 +08:00
|
|
|
Relocation r;
|
|
|
|
r.offset = d->value;
|
|
|
|
auto range =
|
|
|
|
std::equal_range(isec->relocations.begin(), isec->relocations.end(), r,
|
|
|
|
[](const Relocation &lhs, const Relocation &rhs) {
|
|
|
|
return lhs.offset < rhs.offset;
|
|
|
|
});
|
|
|
|
|
|
|
|
for (auto it = range.first; it != range.second; ++it)
|
2019-07-02 01:12:26 +08:00
|
|
|
if (it->type == R_RISCV_PCREL_HI20 || it->type == R_RISCV_GOT_HI20 ||
|
|
|
|
it->type == R_RISCV_TLS_GD_HI20 || it->type == R_RISCV_TLS_GOT_HI20)
|
2018-08-10 01:59:56 +08:00
|
|
|
return &*it;
|
|
|
|
|
|
|
|
error("R_RISCV_PCREL_LO12 relocation points to " + isec->getObjMsg(d->value) +
|
|
|
|
" without an associated R_RISCV_PCREL_HI20 relocation");
|
|
|
|
return nullptr;
|
|
|
|
}
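A hedged usage sketch (lo12Sym and lo12Addend are hypothetical placeholders for the LO12 relocation's symbol and addend): the LO12 symbol is the label of the paired auipc, and the HI20 relocation found there supplies the real target.
```
// Resolve an R_RISCV_PCREL_LO12_{I,S} via its paired HI20 relocation.
if (const Relocation *hi = getRISCVPCRelHi20(lo12Sym, lo12Addend)) {
  uint64_t target = hi->sym->getVA(hi->addend); // VA the auipc/addi pair addresses
  (void)target; // the written value is then computed relative to the auipc's place
}
```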
|
|
|
|
|
[ELF] Refactor per-target TLS layout configuration. NFC.
Summary:
There are really three different kinds of TLS layouts:
* A fixed TLS-to-TP offset. On architectures like PowerPC, MIPS, and
RISC-V, the thread pointer points to a fixed offset from the start
of the executable's TLS segment. The offset is 0x7000 for PowerPC
and MIPS, which allows a signed 16-bit offset to reach 0x1000 of
per-thread implementation data and 0xf000 of the application's TLS
segment. The size and layout of the TCB isn't relevant to the static
linker and might not be known.
* A fixed TCB size. This is the format documented as "variant 1" in
Ulrich Drepper's TLS spec. The thread pointer points to a 2-word TCB
followed by the executable's TLS segment. The first word is always
the DTV pointer. Used on ARM. The thread pointer must be aligned to
the TLS segment's alignment, possibly creating alignment padding.
* Variant 2. This format predates variant 1 and is also documented in
Drepper's TLS spec. It allocates the executable's TLS segment before
the thread pointer, apparently for backwards-compatibility. It's
used on x86 and SPARC.
Factor out an lld::elf::getTlsTpOffset() function for use in a
follow-up patch for Android. The TcbSize/TlsTpOffset fields are only used
in getTlsTpOffset, so replace them with a switch on Config->EMachine.
Reviewers: espindola, ruiu, PkmX, jrtc27
Reviewed By: ruiu, PkmX, jrtc27
Subscribers: jyknight, emaste, sdardis, nemanjai, javed.absar, arichardson, kristof.beyls, kbarton, fedor.sergeev, atanasyan, PkmX, jsji, llvm-commits
Differential Revision: https://reviews.llvm.org/D53905
llvm-svn: 345775
2018-11-01 04:53:17 +08:00
|
|
|
// A TLS symbol's virtual address is relative to the TLS segment. Add a
|
|
|
|
// target-specific adjustment to produce a thread-pointer-relative offset.
|
2019-05-30 18:00:20 +08:00
|
|
|
static int64_t getTlsTpOffset(const Symbol &s) {
|
|
|
|
// On targets that support TLSDESC, _TLS_MODULE_BASE_@tpoff = 0.
|
|
|
|
if (&s == ElfSym::tlsModuleBase)
|
|
|
|
return 0;
|
|
|
|
|
[ELF][PPC] Allow PT_LOAD to have overlapping p_offset ranges
This change affects the non-linker script case (precisely, when the
`SECTIONS` command is not used). It deletes 3 alignments at PT_LOAD
boundaries for the default case: the size of a powerpc64 binary can be
decreased by at most 192kb. The technique can be ported to other
targets.
Let me demonstrate the idea with a maxPageSize=65536 example:
When assigning the address to the first output section of a new PT_LOAD,
if the end p_vaddr of the previous PT_LOAD is 0x10020, we advance to
the next multiple of maxPageSize: 0x20000. The new PT_LOAD will thus
have p_vaddr=0x20000. Because p_offset and p_vaddr are congruent modulo
maxPageSize, p_offset will be 0x20000, leaving a p_offset gap [0x10020,
0x20000) in the output.
Alternatively, if we advance to 0x20020, the new PT_LOAD will have
p_vaddr=0x20020. We can pick either 0x10020 or 0x20020 for p_offset!
Obviously 0x10020 is the choice because it leaves no gap. At runtime,
p_vaddr will be rounded down by pagesize (65536 if
pagesize=maxPageSize). This PT_LOAD will load additional initial
contents from p_offset ranges [0x10000,0x10020), which will also be
loaded by the previous PT_LOAD. This is fine if -z noseparate-code is in
effect or if we are not transiting between executable and non-executable
segments.
ld.bfd -z noseparate-code leverages this technique to keep output small.
This patch implements the technique in lld, which is mostly effective on
targets with large defaultMaxPageSize (AArch64/MIPS/PPC: 65536). The 3
removed alignments can save almost 3*65536 bytes.
Two places that rely on p_vaddr%pagesize = 0 have to be updated.
1) We used to round p_memsz(PT_GNU_RELRO) up to commonPageSize (defaults
to 4096 on all targets). Now p_vaddr%commonPageSize may be non-zero.
The updated formula takes account of that factor.
2) Our TP offsets formulae are only correct if p_vaddr%p_align = 0.
Fix them. See the updated comments in InputSection.cpp for details.
On targets that we enable the technique (only PPC64 now),
we can potentially make `p_vaddr(PT_TLS)%p_align(PT_TLS) != 0`
if `sh_addralign(.tdata) < sh_addralign(.tbss)`
This exposes many problems in ld.so implementations, especially the
offsets of dynamic TLS blocks. Known issues:
FreeBSD 13.0-CURRENT rtld-elf (i386/amd64/powerpc/arm64)
glibc (HEAD) i386 and x86_64 https://sourceware.org/bugzilla/show_bug.cgi?id=24606
musl<=1.1.22 on TLS Variant I architectures (aarch64/powerpc64/...)
So, force p_vaddr%p_align = 0 by rounding dot up to p_align(PT_TLS).
The technique will be enabled (with updated tests) for other targets in
subsequent patches.
Reviewed By: ruiu
Differential Revision: https://reviews.llvm.org/D64906
llvm-svn: 369343
2019-08-20 16:34:25 +08:00
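As a quick illustration of the p_vaddr/p_offset choice described in that commit message (numbers taken from its example; variable names hypothetical):
```
// prevEnd = end p_vaddr of the previous PT_LOAD.
uint64_t prevEnd = 0x10020, maxPageSize = 0x10000;
uint64_t gapVA = (prevEnd + maxPageSize - 1) & ~(maxPageSize - 1); // 0x20000, leaves a file gap
uint64_t noGapVA = gapVA + prevEnd % maxPageSize;                  // 0x20020
uint64_t noGapOff = prevEnd; // 0x10020, congruent to noGapVA modulo maxPageSize
```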
|
|
|
// There are 2 TLS layouts. Among targets we support, x86 uses TLS Variant 2
|
|
|
|
// while most others use Variant 1. At run time TP will be aligned to p_align.
|
|
|
|
|
|
|
|
// Variant 1. TP will be followed by an optional gap (which is the size of 2
|
|
|
|
// pointers on ARM/AArch64, 0 on other targets), followed by alignment
|
|
|
|
// padding, then the static TLS blocks. The alignment padding is added so that
|
|
|
|
// (TP + gap + padding) is congruent to p_vaddr modulo p_align.
|
|
|
|
//
|
|
|
|
// Variant 2. Static TLS blocks, followed by alignment padding are placed
|
|
|
|
// before TP. The alignment padding is added so that (TP - padding -
|
|
|
|
// p_memsz) is congruent to p_vaddr modulo p_align.
|
2019-10-07 16:31:18 +08:00
|
|
|
PhdrEntry *tls = Out::tlsPhdr;
|
2018-11-01 04:53:17 +08:00
|
|
|
switch (config->emachine) {
|
2019-08-20 16:34:25 +08:00
|
|
|
// Variant 1.
|
2018-11-01 04:53:17 +08:00
|
|
|
case EM_ARM:
|
|
|
|
case EM_AARCH64:
|
2019-08-20 16:34:25 +08:00
|
|
|
return s.getVA(0) + config->wordsize * 2 +
|
|
|
|
((tls->p_vaddr - config->wordsize * 2) & (tls->p_align - 1));
|
2019-07-24 19:37:13 +08:00
|
|
|
case EM_MIPS:
|
2019-06-07 01:03:10 +08:00
|
|
|
case EM_PPC:
|
2018-11-01 04:53:17 +08:00
|
|
|
case EM_PPC64:
|
2019-08-20 16:34:25 +08:00
|
|
|
// Adjusted Variant 1. TP is placed with a displacement of 0x7000, which is
|
|
|
|
// to allow a signed 16-bit offset to reach 0x1000 of TCB/thread-library
|
|
|
|
// data and 0xf000 of the program's TLS segment.
|
|
|
|
return s.getVA(0) + (tls->p_vaddr & (tls->p_align - 1)) - 0x7000;
|
2019-07-02 01:12:26 +08:00
|
|
|
case EM_RISCV:
|
2019-08-20 16:34:25 +08:00
|
|
|
return s.getVA(0) + (tls->p_vaddr & (tls->p_align - 1));
|
|
|
|
|
|
|
|
// Variant 2.
|
2020-01-03 00:27:05 +08:00
|
|
|
case EM_HEXAGON:
|
2020-04-17 22:58:15 +08:00
|
|
|
case EM_SPARCV9:
|
2019-08-20 16:34:25 +08:00
|
|
|
case EM_386:
|
|
|
|
case EM_X86_64:
|
|
|
|
return s.getVA(0) - tls->p_memsz -
|
|
|
|
((-tls->p_vaddr - tls->p_memsz) & (tls->p_align - 1));
|
2018-11-01 04:53:17 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("unhandled Config->EMachine");
|
|
|
|
}
|
|
|
|
}
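A worked example of the Variant 1 arithmetic above, with hypothetical PT_TLS values (AArch64, wordsize 8):
```
// TP offset = VA + 2 words + pad, where pad keeps (TP + gap + pad) congruent
// to p_vaddr modulo p_align. All numbers hypothetical.
uint64_t pVaddr = 0x10020, pAlign = 0x40, wordsize = 8;
uint64_t symVA = 0x8; // symbol's offset within the TLS segment
uint64_t off = symVA + wordsize * 2 + ((pVaddr - wordsize * 2) & (pAlign - 1));
// (0x10020 - 16) & 0x3f == 0x10, so off == 0x8 + 0x10 + 0x10 == 0x28
```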
|
|
|
|
|
2020-04-07 21:48:18 +08:00
|
|
|
uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
|
|
|
|
int64_t a, uint64_t p,
|
|
|
|
const Symbol &sym, RelExpr expr) {
|
2016-04-13 09:40:19 +08:00
|
|
|
switch (expr) {
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_ABS:
|
2019-04-23 14:31:44 +08:00
|
|
|
case R_DTPREL:
|
2018-07-10 00:35:51 +08:00
|
|
|
case R_RELAX_TLS_LD_TO_LE_ABS:
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_RELAX_GOT_PC_NOPIC:
|
[ELF][RISCV] Treat R_RISCV_{ADD,SET,SUB}* as link-time constants
R_RISCV_{ADD,SET,SUB}* are used for local label computation.
Add a new RelExpr member R_RISCV_ADD to represent them.
R_RISCV_ADD is treated as a link-time constant because otherwise
R_RISCV_{ADD,SET,SUB}* are not allowed in -pie/-shared mode.
In glibc Scrt1.o, .rela.eh_frame contains such relocations.
Because .eh_frame is not writable, we get this error:
ld.lld: error: can't create dynamic relocation R_RISCV_ADD32 against symbol: .L0 in readonly segment; recompile object files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in the output
>>> defined in ..../riscv64-linux-gnu/lib/Scrt1.o
With D63076 and this patch, I can run -pie/-shared programs linked against glibc.
Note llvm-mc cannot currently produce R_RISCV_SET* so they are not tested.
Reviewed By: ruiu
Differential Revision: https://reviews.llvm.org/D63183
llvm-svn: 363128
2019-06-12 15:53:06 +08:00
|
|
|
case R_RISCV_ADD:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getVA(a);
|
2018-02-16 18:01:17 +08:00
|
|
|
case R_ADDEND:
|
|
|
|
return a;
|
2017-05-18 17:12:21 +08:00
|
|
|
case R_ARM_SBREL:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getVA(a) - getARMStaticBase(sym);
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_GOT:
|
|
|
|
case R_RELAX_TLS_GD_TO_IE_ABS:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getGotVA() + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_GOTONLY_PC:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.got->getVA() + a - p;
|
[ELF] Change GOT*_FROM_END (relative to end(.got)) to GOTPLT* (start(.got.plt))
Summary:
This should address remaining issues discussed in PR36555.
Currently R_GOT*_FROM_END are exclusively used by x86 and x86_64 to
express relocations types relative to the GOT base. We have
_GLOBAL_OFFSET_TABLE_ (GOT base) = start(.got.plt) but end(.got) !=
start(.got.plt)
This can have problems when _GLOBAL_OFFSET_TABLE_ is used as a symbol, e.g.
glibc dl_machine_dynamic assumes _GLOBAL_OFFSET_TABLE_ is start(.got.plt),
which is not true.
extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
return _GLOBAL_OFFSET_TABLE_[0]; // R_X86_64_GOTPC32
In this patch, we
* Change all GOT*_FROM_END to GOTPLT* to fix the problem.
* Add HasGotPltOffRel to denote whether .got.plt should be kept even if
the section is empty.
* Simplify GotSection::empty and GotPltSection::empty by setting
HasGotOffRel and HasGotPltOffRel according to GlobalOffsetTable early.
The change of R_386_GOTPC makes X86::writePltHeader simpler as we don't
have to compute the offset start(.got.plt) - Ebx (it is constant 0).
We still diverge from ld.bfd (at least in most cases) and gold in that
.got.plt and .got are not adjacent, but the advantage doing that is
unclear.
Reviewers: ruiu, sivachandra, espindola
Subscribers: emaste, mehdi_amini, arichardson, dexonsmith, jdoerfert, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D59594
llvm-svn: 356968
2019-03-26 07:46:19 +08:00
|
|
|
case R_GOTPLTONLY_PC:
|
|
|
|
return in.gotPlt->getVA() + a - p;
|
2016-04-18 20:07:13 +08:00
|
|
|
case R_GOTREL:
|
[PPC64] toc-indirect to toc-relative relaxation
This is based on D54720 by Sean Fertile.
When accessing a global symbol which is not defined in the translation unit,
compilers will generate instructions that load the address from the toc entry.
If the symbol is defined, non-preemptable, and addressable with a 32-bit
signed offset from the toc pointer, the address can be computed
directly. e.g.
addis 3, 2, .LC0@toc@ha # R_PPC64_TOC16_HA
ld 3, .LC0@toc@l(3) # R_PPC64_TOC16_LO_DS, load the address from a .toc entry
ld/lwa 3, 0(3) # load the value from the address
.section .toc,"aw",@progbits
.LC0: .tc var[TC],var
can be relaxed to
addis 3,2,var@toc@ha # this may be relaxed to a nop,
addi 3,3,var@toc@l # then this becomes addi 3,2,var@toc
ld/lwa 3, 0(3) # load the value from the address
We can delete the test ppc64-got-indirect.s as its purpose is covered by
newly added ppc64-toc-relax.s and ppc64-toc-relax-constants.s
Reviewed By: ruiu, sfertile
Differential Revision: https://reviews.llvm.org/D60958
llvm-svn: 360112
2019-05-07 12:26:05 +08:00
|
|
|
case R_PPC64_RELAX_TOC:
|
2018-09-26 03:26:58 +08:00
|
|
|
return sym.getVA(a) - in.got->getVA();
|
2019-03-26 07:46:19 +08:00
|
|
|
case R_GOTPLTREL:
|
|
|
|
return sym.getVA(a) - in.gotPlt->getVA();
|
|
|
|
case R_GOTPLT:
|
2019-04-22 10:48:37 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE_GOTPLT:
|
2019-03-26 07:46:19 +08:00
|
|
|
return sym.getGotVA() + a - in.gotPlt->getVA();
|
2018-06-27 21:55:41 +08:00
|
|
|
case R_TLSLD_GOT_OFF:
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_GOT_OFF:
|
[PPC64] Thread-local storage general-dynamic to initial-exec relaxation.
Patch adds support for relaxing the general-dynamic tls sequence to
initial-exec.
the relaxation performs the following transformation:
addis r3, r2, x@got@tlsgd@ha --> addis r3, r2, x@got@tprel@ha
addi r3, r3, x@got@tlsgd@l --> ld r3, x@got@tprel@l(r3)
bl __tls_get_addr(x@tlsgd) --> nop
nop --> add r3, r3, r13
and instead of emitting a DTPMOD64/DTPREL64 pair for x, we emit a single
R_PPC64_TPREL64.
Differential Revision: https://reviews.llvm.org/D48090
llvm-svn: 335651
2018-06-27 03:38:18 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE_GOT_OFF:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getGotOffset() + a;
|
2018-11-13 18:16:36 +08:00
|
|
|
case R_AARCH64_GOT_PAGE_PC:
|
|
|
|
case R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC:
|
2017-11-04 08:31:04 +08:00
|
|
|
return getAArch64Page(sym.getGotVA() + a) - getAArch64Page(p);
|
2021-01-14 01:29:16 +08:00
|
|
|
case R_AARCH64_GOT_PAGE:
|
|
|
|
return sym.getGotVA() + a - getAArch64Page(in.got->getVA());
|
2016-04-13 09:40:19 +08:00
|
|
|
case R_GOT_PC:
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getGotVA() + a - p;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_MIPS_GOTREL:
|
2018-09-26 03:26:58 +08:00
|
|
|
return sym.getVA(a) - in.mipsGot->getGp(file);
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_MIPS_GOT_GP:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.mipsGot->getGp(file) + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_MIPS_GOT_GP_PC: {
|
|
|
|
// R_MIPS_LO16 expression has R_MIPS_GOT_GP_PC type iff the target
|
|
|
|
// is the _gp_disp symbol. In that case we should use the following
|
|
|
|
// formula for the calculation: "AHL + GP - P + 4". For details see p. 4-19 at
|
|
|
|
// ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
|
2017-11-09 20:10:14 +08:00
|
|
|
// microMIPS variants of these relocations use slightly different
|
|
|
|
// expressions: AHL + GP - P + 3 for %lo() and AHL + GP - P - 1 for %hi()
|
2020-04-02 00:21:08 +08:00
|
|
|
// to correctly handle less-significant bit of the microMIPS symbol.
|
2018-09-26 03:26:58 +08:00
|
|
|
uint64_t v = in.mipsGot->getGp(file) + a - p;
|
2017-09-12 21:08:24 +08:00
|
|
|
if (type == R_MIPS_LO16 || type == R_MICROMIPS_LO16)
|
2017-03-26 12:10:43 +08:00
|
|
|
v += 4;
|
2017-11-09 20:10:14 +08:00
|
|
|
if (type == R_MICROMIPS_LO16 || type == R_MICROMIPS_HI16)
|
|
|
|
v -= 1;
|
2017-03-26 12:10:43 +08:00
|
|
|
return v;
|
|
|
|
}
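A numeric sketch of the formula above (GP, place, and addend are hypothetical):
```
// For the %lo() half: v = AHL + GP - P + 4.
uint64_t gp = 0x418000, p = 0x400010;
int64_t ahl = 0x20;        // combined %hi/%lo addend
uint64_t v = gp + ahl - p; // base expression
v += 4;                    // R_MIPS_LO16 adjustment
```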
|
2016-05-16 02:13:50 +08:00
|
|
|
case R_MIPS_GOT_LOCAL_PAGE:
|
2016-03-13 23:37:38 +08:00
|
|
|
// If a relocation against a MIPS local symbol requires a GOT entry, this entry
|
|
|
|
// should be initialized by the 'page address'. This address is the high 16 bits
|
2016-03-30 20:45:58 +08:00
|
|
|
// of the sum of the symbol's value and the addend.
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.mipsGot->getVA() + in.mipsGot->getPageEntryOffset(file, sym, a) -
|
|
|
|
in.mipsGot->getGp(file);
|
2016-06-20 05:39:37 +08:00
|
|
|
case R_MIPS_GOT_OFF:
|
2016-10-21 15:22:30 +08:00
|
|
|
case R_MIPS_GOT_OFF32:
|
2016-06-20 05:39:37 +08:00
|
|
|
// On MIPS, if a GOT relocation has a non-zero addend, the addend should be
|
|
|
|
// applied to the GOT entry's content, not to the GOT entry's offset.
|
|
|
|
// That is why we use a separate expression type.
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.mipsGot->getVA() + in.mipsGot->getSymEntryOffset(file, sym, a) -
|
|
|
|
in.mipsGot->getGp(file);
|
2016-06-23 23:26:31 +08:00
|
|
|
case R_MIPS_TLSGD:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.mipsGot->getVA() + in.mipsGot->getGlobalDynOffset(file, sym) -
|
|
|
|
in.mipsGot->getGp(file);
|
2016-06-23 23:26:31 +08:00
|
|
|
case R_MIPS_TLSLD:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.mipsGot->getVA() + in.mipsGot->getTlsIndexOffset(file) -
|
|
|
|
in.mipsGot->getGp(file);
|
2018-11-15 23:35:44 +08:00
|
|
|
case R_AARCH64_PAGE_PC: {
|
2018-12-13 19:13:01 +08:00
|
|
|
uint64_t val = sym.isUndefWeak() ? p + a : sym.getVA(a);
|
2018-11-14 21:53:47 +08:00
|
|
|
return getAArch64Page(val) - getAArch64Page(p);
|
|
|
|
}
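A small numeric sketch of the page-delta computation (addresses hypothetical; getAArch64Page clears the low 12 bits):
```
auto page = [](uint64_t v) { return v & ~uint64_t(0xfff); };
uint64_t symVA = 0x12345, p = 0x20007;
uint64_t rel = page(symVA) - page(p); // 0x12000 - 0x20000
```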
|
2018-08-10 01:59:56 +08:00
|
|
|
case R_RISCV_PC_INDIRECT: {
|
2018-11-23 23:13:26 +08:00
|
|
|
if (const Relocation *hiRel = getRISCVPCRelHi20(&sym, a))
|
|
|
|
return getRelocTargetVA(file, hiRel->type, hiRel->addend, sym.getVA(),
|
|
|
|
*hiRel->sym, hiRel->expr);
|
|
|
|
return 0;
|
2018-08-10 01:59:56 +08:00
|
|
|
}
|
2020-02-24 03:46:46 +08:00
|
|
|
case R_PC:
|
|
|
|
case R_ARM_PCA: {
|
2017-06-13 02:05:01 +08:00
|
|
|
uint64_t dest;
|
2020-02-24 03:46:46 +08:00
|
|
|
if (expr == R_ARM_PCA)
|
|
|
|
// Some PC relative ARM (Thumb) relocations align down the place.
|
|
|
|
p = p & 0xfffffffc;
|
2017-11-04 08:31:04 +08:00
|
|
|
if (sym.isUndefWeak()) {
|
2021-06-11 04:25:16 +08:00
|
|
|
// On ARM and AArch64 a branch to an undefined weak resolves to the next
|
|
|
|
// instruction, otherwise the place. On RISCV, resolve an undefined weak
|
|
|
|
// to the same instruction to cause an infinite loop (making the user
|
|
|
|
// aware of the issue) while ensuring no overflow.
|
2017-03-26 12:10:43 +08:00
|
|
|
if (config->emachine == EM_ARM)
|
2017-06-13 02:05:01 +08:00
|
|
|
dest = getARMUndefinedRelativeWeakVA(type, a, p);
|
|
|
|
else if (config->emachine == EM_AARCH64)
|
2021-06-11 04:30:16 +08:00
|
|
|
dest = getAArch64UndefinedRelativeWeakVA(type, p) + a;
|
[PPC32] Improve the 32-bit PowerPC port
Many -static/-no-pie/-shared/-pie applications linked against glibc or musl
should work with this patch. This also helps FreeBSD PowerPC64 to migrate
their lib32 (PR40888).
* Fix default image base and max page size.
* Support new-style Secure PLT (see below). Old-style BSS PLT is not
implemented, so it is not suitable for FreeBSD rtld now because it doesn't
support Secure PLT yet.
* Support more initial relocation types:
R_PPC_ADDR32, R_PPC_REL16*, R_PPC_LOCAL24PC, R_PPC_PLTREL24, and R_PPC_GOT16.
The addend of R_PPC_PLTREL24 is special: it decides the call stub PLT type
but it should be ignored for the computation of target symbol VA.
* Support GNU ifunc
* Support .glink used for lazy PLT resolution in glibc
* Add a new thunk type: PPC32PltCallStub that is similar to PPC64PltCallStub.
It is used by R_PPC_REL24 and R_PPC_PLTREL24.
A PLT stub used in -fPIE/-fPIC usually loads an address relative to
.got2+0x8000 (-fpie/-fpic code uses _GLOBAL_OFFSET_TABLE_ relative
addresses).
Two .got2 sections in two object files have different addresses, thus a PLT stub
can't be shared by two object files. To handle this incompatibility,
change the parameters of Thunk::isCompatibleWith to
`const InputSection &, const Relocation &`.
PowerPC psABI specified an old-style .plt (BSS PLT) that is both
writable and executable. Linkers don't make separate RW- and RWE segments,
which causes all initially writable memory (think .data) executable.
This is a big security concern so a new PLT scheme (secure PLT) was developed to
address the security issue.
TLS will be implemented in D62940.
glibc older than ~2012 requires .rela.dyn to include .rela.plt; it cannot
handle the DT_RELA+DT_RELASZ == DT_JMPREL case correctly. A hack
(not included in this patch) in LinkerScript.cpp addOrphanSections() to
work around the issue:
if (Config->EMachine == EM_PPC) {
// Older glibc assumes .rela.dyn includes .rela.plt
Add(In.RelaDyn);
if (In.RelaPlt->isLive() && !In.RelaPlt->Parent)
In.RelaDyn->getParent()->addSection(In.RelaPlt);
}
Reviewed By: ruiu
Differential Revision: https://reviews.llvm.org/D62464
llvm-svn: 362721
2019-06-07 01:03:00 +08:00
|
|
|
else if (config->emachine == EM_PPC)
|
|
|
|
dest = p;
|
2021-06-11 04:25:16 +08:00
|
|
|
else if (config->emachine == EM_RISCV)
|
|
|
|
dest = getRISCVUndefinedRelativeWeakVA(type, p) + a;
|
2017-06-13 02:05:01 +08:00
|
|
|
else
|
2017-11-04 08:31:04 +08:00
|
|
|
dest = sym.getVA(a);
|
2017-06-13 02:05:01 +08:00
|
|
|
} else {
|
2017-11-04 08:31:04 +08:00
|
|
|
dest = sym.getVA(a);
|
2017-03-26 12:10:43 +08:00
|
|
|
}
|
2017-06-13 02:05:01 +08:00
|
|
|
return dest - p;
|
|
|
|
}
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_PLT:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getPltVA() + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_PLT_PC:
|
2019-06-03 14:21:33 +08:00
|
|
|
case R_PPC64_CALL_PLT:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getPltVA() + a - p;
|
2021-10-26 04:05:17 +08:00
|
|
|
case R_PLT_GOTPLT:
|
|
|
|
return sym.getPltVA() + a - in.gotPlt->getVA();
|
2019-06-07 01:03:00 +08:00
|
|
|
case R_PPC32_PLTREL:
|
|
|
|
// R_PPC_PLTREL24 uses the addend (usually 0 or 0x8000) to indicate r30
|
|
|
|
// stores _GLOBAL_OFFSET_TABLE_ or .got2+0x8000. The addend is ignored for
|
2019-10-29 09:41:38 +08:00
|
|
|
// target VA computation.
|
2019-06-07 01:03:00 +08:00
|
|
|
return sym.getPltVA() - p;
|
2019-06-03 14:21:33 +08:00
|
|
|
case R_PPC64_CALL: {
|
2017-11-04 08:31:04 +08:00
|
|
|
uint64_t symVA = sym.getVA(a);
|
2016-04-13 09:40:19 +08:00
|
|
|
// If we have an undefined weak symbol, we might get here with a symbol
|
|
|
|
// address of zero. That could overflow, but the code must be unreachable,
|
|
|
|
// so don't bother doing anything at all.
|
|
|
|
if (!symVA)
|
|
|
|
return 0;
|
2018-04-27 23:41:19 +08:00
|
|
|
|
|
|
|
// PPC64 V2 ABI describes two entry points to a function. The global entry
|
2018-09-20 08:26:47 +08:00
|
|
|
// point is used for calls where the caller and callee (may) have different
|
|
|
|
// TOC base pointers and r2 needs to be modified to hold the TOC base for
|
|
|
|
// the callee. For local calls the caller and callee share the same
|
|
|
|
// TOC base and so the TOC pointer initialization code should be skipped by
|
|
|
|
// branching to the local entry point.
|
|
|
|
return symVA - p + getPPC64GlobalEntryToLocalEntryOffset(sym.stOther);
|
2016-04-13 09:40:19 +08:00
|
|
|
}
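A hedged numeric sketch of the local-entry adjustment above (addresses and st_other encoding hypothetical):
```
// Assume st_other encodes "local entry = global entry + 8".
uint64_t symVA = 0x10000, p = 0x9000;
uint64_t localEntryOff = 8; // getPPC64GlobalEntryToLocalEntryOffset(sym.stOther)
uint64_t value = symVA - p + localEntryOff; // branch past the TOC setup instructions
```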
|
2019-06-03 14:21:33 +08:00
|
|
|
case R_PPC64_TOCBASE:
|
2017-03-26 12:10:43 +08:00
|
|
|
return getPPC64TocBase() + a;
|
2016-05-25 22:31:37 +08:00
|
|
|
case R_RELAX_GOT_PC:
|
2020-08-17 22:30:14 +08:00
|
|
|
case R_PPC64_RELAX_GOT_PC:
|
2017-11-04 08:31:04 +08:00
|
|
|
return sym.getVA(a) - p;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_LE:
|
|
|
|
case R_RELAX_TLS_IE_TO_LE:
|
|
|
|
case R_RELAX_TLS_LD_TO_LE:
|
2020-12-19 00:24:42 +08:00
|
|
|
case R_TPREL:
|
2019-05-20 19:47:31 +08:00
|
|
|
// It is not very clear what to return if the symbol is undefined. With
|
|
|
|
// --noinhibit-exec, even a non-weak undefined reference may reach here.
|
|
|
|
// Just return A, which matches R_ABS, and the behavior of some dynamic
|
|
|
|
// loaders.
|
2021-10-29 12:03:53 +08:00
|
|
|
if (sym.isUndefined())
|
2019-05-20 19:47:31 +08:00
|
|
|
return a;
|
2019-05-30 18:00:20 +08:00
|
|
|
return getTlsTpOffset(sym) + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_LE_NEG:
|
2020-12-19 00:24:42 +08:00
|
|
|
case R_TPREL_NEG:
|
2019-05-20 19:47:31 +08:00
|
|
|
if (sym.isUndefined())
|
|
|
|
return a;
|
2019-05-30 18:00:20 +08:00
|
|
|
return -getTlsTpOffset(sym) + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_SIZE:
|
2018-01-06 05:41:17 +08:00
|
|
|
return sym.getSize() + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_TLSDESC:
|
2022-01-11 02:03:21 +08:00
|
|
|
return in.got->getTlsDescAddr(sym) + a;
|
2019-05-29 10:03:56 +08:00
|
|
|
case R_TLSDESC_PC:
|
2022-01-11 02:03:21 +08:00
|
|
|
return in.got->getTlsDescAddr(sym) + a - p;
|
2021-10-29 08:52:03 +08:00
|
|
|
case R_TLSDESC_GOTPLT:
|
2022-01-11 02:03:21 +08:00
|
|
|
return in.got->getTlsDescAddr(sym) + a - in.gotPlt->getVA();
|
2018-11-15 23:35:44 +08:00
|
|
|
case R_AARCH64_TLSDESC_PAGE:
|
2022-01-11 02:03:21 +08:00
|
|
|
return getAArch64Page(in.got->getTlsDescAddr(sym) + a) - getAArch64Page(p);
|
2018-05-29 22:34:38 +08:00
|
|
|
case R_TLSGD_GOT:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.got->getGlobalDynOffset(sym) + a;
|
2019-03-26 07:46:19 +08:00
|
|
|
case R_TLSGD_GOTPLT:
|
2021-04-17 10:08:23 +08:00
|
|
|
return in.got->getGlobalDynAddr(sym) + a - in.gotPlt->getVA();
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_TLSGD_PC:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.got->getGlobalDynAddr(sym) + a - p;
|
2019-03-26 07:46:19 +08:00
|
|
|
case R_TLSLD_GOTPLT:
|
|
|
|
return in.got->getVA() + in.got->getTlsIndexOff() + a - in.gotPlt->getVA();
|
2018-06-01 02:44:12 +08:00
|
|
|
case R_TLSLD_GOT:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.got->getTlsIndexOff() + a;
|
2017-03-26 12:10:43 +08:00
|
|
|
case R_TLSLD_PC:
|
2018-09-26 03:26:58 +08:00
|
|
|
return in.got->getTlsIndexVA() + a - p;
|
2018-08-02 22:34:39 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("invalid expression");
|
2016-04-13 09:40:19 +08:00
|
|
|
}
|
2016-03-13 23:37:38 +08:00
|
|
|
}
|
|
|
|
|
2016-04-29 02:42:04 +08:00
|
|
|
// This function applies relocations to sections without SHF_ALLOC bit.
|
|
|
|
// Such sections are never mapped to memory at runtime. Debug sections are
|
|
|
|
// an example. Relocations in non-alloc sections are much easier to
|
|
|
|
// handle than in allocated sections because they never need complex
|
2019-10-31 19:54:16 +08:00
|
|
|
// treatment such as GOT or PLT (because nothing refers to them at runtime).
|
2016-04-29 02:42:04 +08:00
|
|
|
// So, we handle relocations for non-alloc sections directly in this
|
|
|
|
// function as a performance optimization.
|
2017-02-24 00:49:07 +08:00
|
|
|
template <class ELFT, class RelTy>
|
|
|
|
void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
|
2017-10-10 12:45:48 +08:00
|
|
|
const unsigned bits = sizeof(typename ELFT::uint) * 8;
|
2021-12-25 09:54:12 +08:00
|
|
|
const TargetInfo &target = *elf::target;
|
[ELF] Resolve relocations in .debug_* referencing (discarded symbols or ICF folded section symbols) to tombstone values
See D59553, https://lists.llvm.org/pipermail/llvm-dev/2020-May/141885.html and
https://sourceware.org/pipermail/binutils/2020-May/111357.html for
extensive discussions on a tombstone value.
See http://www.dwarfstd.org/ShowIssue.php?issue=200609.1
(Reserve an address value for "not present") for a DWARF enhancement proposal.
We resolve such relocations to a tombstone value to indicate that the address is invalid.
This solves several problems (the normal behavior is to resolve the relocation to the addend):
* For an empty function in a collected section, a pair of (0,0) can
terminate .debug_loc and .debug_ranges (as of binutils 2.34, GNU ld
resolves such a relocation to 1 to avoid the .debug_ranges issue)
* If DW_AT_high_pc is sufficiently large, the address range can collide
with a regular code range of low address (https://bugs.llvm.org/show_bug.cgi?id=41124 )
* If a text section is folded into another by ICF, we may leave entries
in multiple CUs claiming ownership of the same range of code, which can
confuse consumers.
* Debug information associated with COMDAT sections can have problems
similar to ICF, but is more complex - thus not addressed by this patch.
For pre-DWARF-v5 .debug_loc and .debug_ranges, a pair of 0 can terminate
entries (invalidating subsequent ranges).
-1 is a reserved value with special meaning (base address selection entry) which can't be used either.
Use -2 instead.
For all other .debug_*, use UINT32_MAX for 32-bit targets and UINT64_MAX
for 64-bit targets. In the code, we intentionally use
`uint64_t tombstone = UINT64_MAX` for 32-bit targets as well: this matches
SignExtend64 as used in `relocateAlloc`. (Actually UINT32_MAX does not work for R_386_32)
Note 0, we only special case `target->symbolicRel` (R_X86_64_64, R_AARCH64_ABS64, R_PPC64_ADDR64), not
short-range absolute relocations (e.g. R_X86_64_32). Only forms like DW_FORM_addr need to be special cased.
They can hold an arbitrary address (must be 64-bit on a 64-bit target). (In theory,
producers can make use of small code model to emit 32-bit relocations. This doesn't seem to be leveraged.)
Note 1, we have to ignore the addend, because we don't want to resolve
DW_AT_low_pc (which may have a non-zero addend) to -1+addend (wrap
around to a low address):
__attribute__((section(".text.x"))) void f1() { }
__attribute__((section(".text.x"))) void f2() { } // DW_AT_low_pc has a non-zero addend
Note 2, if the prevailing copy does not have debugging information while
a non-prevailing copy has (partial debug build), we don't do extra work
to attach debugging information to the prevailing definition. (clang
has a lot of debug info optimizations that are on-by-default that assume
the whole program is built with debug info).
clang -c -ffunction-sections a.cc # prevailing copy has no debug info
clang -c -ffunction-sections -g b.cc
Reviewed By: dblaikie, avl, jhenderson
Differential Revision: https://reviews.llvm.org/D81784
2020-06-24 02:06:39 +08:00
|
|
|
const bool isDebug = isDebugSection(*this);
|
|
|
|
const bool isDebugLocOrRanges =
|
|
|
|
isDebug && (name == ".debug_loc" || name == ".debug_ranges");
|
2020-07-02 04:37:20 +08:00
|
|
|
const bool isDebugLine = isDebug && name == ".debug_line";
|
2020-07-09 01:10:43 +08:00
|
|
|
Optional<uint64_t> tombstone;
|
|
|
|
for (const auto &patAndValue : llvm::reverse(config->deadRelocInNonAlloc))
|
|
|
|
if (patAndValue.first.match(this->name)) {
|
|
|
|
tombstone = patAndValue.second;
|
|
|
|
break;
|
|
|
|
}
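// For illustration (assumed command line, following the lld option syntax):
// the (pattern, value) pairs in config->deadRelocInNonAlloc come from
//   -z dead-reloc-in-nonalloc=<section_glob>=<value>
// e.g.
//   ld.lld -z dead-reloc-in-nonalloc='.debug_*'=0xffffffffffffffff \
//          -z dead-reloc-in-nonalloc=.debug_ranges=1 a.o
// Iterating the list in reverse above makes the last matching pattern win.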
|
2017-10-10 12:45:48 +08:00
|
|
|
|
2016-04-29 02:42:04 +08:00
|
|
|
for (const RelTy &rel : rels) {
|
2017-10-12 06:49:24 +08:00
|
|
|
RelType type = rel.getType(config->isMips64EL);
|
2018-02-16 09:10:51 +08:00
|
|
|
|
|
|
|
// GCC 8.0 and earlier have a bug in which they emit R_386_GOTPC relocations
|
|
|
|
// against _GLOBAL_OFFSET_TABLE_ for .debug_info. The bug has been fixed
|
|
|
|
// in 2017 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82630), but we
|
|
|
|
// need to keep this bug-compatible code for a while.
|
|
|
|
if (config->emachine == EM_386 && type == R_386_GOTPC)
|
|
|
|
continue;
|
|
|
|
|
2020-08-10 23:57:19 +08:00
|
|
|
uint64_t offset = rel.r_offset;
|
2016-05-22 03:48:54 +08:00
|
|
|
uint8_t *bufLoc = buf + offset;
|
2017-02-16 08:12:34 +08:00
|
|
|
int64_t addend = getAddend<ELFT>(rel);
|
2016-04-29 11:21:08 +08:00
|
|
|
if (!RelTy::IsRela)
|
2021-12-25 09:54:12 +08:00
|
|
|
addend += target.getImplicitAddend(bufLoc, type);
|
2016-04-29 02:42:04 +08:00
|
|
|
|
2017-12-20 00:29:02 +08:00
|
|
|
Symbol &sym = getFile<ELFT>()->getRelocTargetSym(rel);
|
2021-12-25 09:54:12 +08:00
|
|
|
RelExpr expr = target.getRelExpr(type, sym, bufLoc);
|
2017-02-23 14:22:28 +08:00
|
|
|
if (expr == R_NONE)
|
|
|
|
continue;
|
2018-02-16 09:10:51 +08:00
|
|
|
|
2020-07-09 01:10:43 +08:00
|
|
|
if (tombstone ||
|
2021-12-25 09:54:12 +08:00
|
|
|
(isDebug && (type == target.symbolicRel || expr == R_DTPREL))) {
|
2020-06-24 02:06:39 +08:00
|
|
|
// Resolve relocations in .debug_* referencing (discarded symbols or ICF
|
|
|
|
// folded section symbols) to a tombstone value. Resolving to addend is
|
|
|
|
// unsatisfactory because the result address range may collide with a
|
|
|
|
// valid range of low address, or leave multiple CUs claiming ownership of
|
|
|
|
// the same range of code, which may confuse consumers.
|
|
|
|
//
|
|
|
|
// To address the problems, we use -1 as a tombstone value for most
|
|
|
|
// .debug_* sections. We have to ignore the addend because we don't want
|
|
|
|
// to resolve an address attribute (which may have a non-zero addend) to
|
|
|
|
// -1+addend (wrap around to a low address).
|
|
|
|
//
|
2020-07-04 00:50:30 +08:00
|
|
|
// R_DTPREL type relocations represent an offset into the dynamic thread
|
|
|
|
// vector. The computed value is st_value plus a non-negative offset.
|
|
|
|
// Negative values are invalid, so -1 can be used as the tombstone value.
|
|
|
|
//
|
2020-06-24 02:06:39 +08:00
|
|
|
// If the referenced symbol is discarded (made Undefined), or the
|
|
|
|
// section defining the referenced symbol is garbage collected,
|
2021-12-25 04:09:48 +08:00
|
|
|
// sym.getOutputSection() is nullptr. `ds->folded` catches the ICF folded
|
|
|
|
// case. However, resolving a relocation in .debug_line to -1 would stop
|
|
|
|
// debugger users from setting breakpoints on the folded-in function, so
|
|
|
|
// exclude .debug_line.
|
2020-06-24 02:06:39 +08:00
|
|
|
//
|
|
|
|
// For pre-DWARF-v5 .debug_loc and .debug_ranges, -1 is a reserved value
|
2020-08-07 03:34:16 +08:00
|
|
|
// (base address selection entry), use 1 (which is used by GNU ld for
|
|
|
|
// .debug_ranges).
|
|
|
|
//
|
|
|
|
// TODO To reduce disruption, we use 0 instead of -1 as the tombstone
|
|
|
|
// value. Enable -1 in a future release.
|
2020-06-24 02:06:39 +08:00
|
|
|
auto *ds = dyn_cast<Defined>(&sym);
|
2021-12-25 04:09:48 +08:00
|
|
|
if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) {
|
2020-07-09 01:10:43 +08:00
|
|
|
// If -z dead-reloc-in-nonalloc= is specified, respect it.
|
2020-08-07 03:34:16 +08:00
|
|
|
const uint64_t value = tombstone ? SignExtend64<bits>(*tombstone)
|
|
|
|
: (isDebugLocOrRanges ? 1 : 0);
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relocateNoSym(bufLoc, type, value);
|
2020-06-24 02:06:39 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2022-01-13 12:38:26 +08:00
|
|
|
|
|
|
|
// For a relocatable link, only tombstone values are applied.
|
|
|
|
if (config->relocatable)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (expr == R_SIZE) {
|
|
|
|
target.relocateNoSym(bufLoc, type,
|
|
|
|
SignExtend64<bits>(sym.getSize() + addend));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// R_ABS/R_DTPREL and some other relocations can be used from non-SHF_ALLOC
|
|
|
|
// sections.
|
|
|
|
if (expr == R_ABS || expr == R_DTPREL || expr == R_GOTPLTREL ||
|
|
|
|
expr == R_RISCV_ADD) {
|
|
|
|
target.relocateNoSym(bufLoc, type, SignExtend64<bits>(sym.getVA(addend)));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-01-19 09:33:58 +08:00
|
|
|
std::string msg = getLocation(offset) + ": has non-ABS relocation " +
|
2022-01-13 12:38:26 +08:00
|
|
|
toString(type) + " against symbol '" + toString(sym) +
|
|
|
|
"'";
|
|
|
|
if (expr != R_PC && expr != R_ARM_PCA) {
|
|
|
|
error(msg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the control reaches here, we found a PC-relative relocation in a
|
|
|
|
// non-ALLOC section. Since non-ALLOC section is not loaded into memory
|
|
|
|
// at runtime, the notion of PC-relative doesn't make sense here. So,
|
|
|
|
// this is a usage error. However, GNU linkers historically accept such
|
|
|
|
// relocations without any errors and relocate them as if they were at
|
|
|
|
// address 0. For bug compatibility, we accept them with warnings. We
|
|
|
|
// know Steel Bank Common Lisp as of 2018 has this bug.
|
|
|
|
warn(msg);
|
|
|
|
target.relocateNoSym(
|
|
|
|
bufLoc, type,
|
|
|
|
SignExtend64<bits>(sym.getVA(addend - offset - outSecOff)));
|
2016-04-29 02:42:04 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-11 20:52:04 +08:00
|
|
|
// This is used when '-r' is given.
|
|
|
|
// For REL targets, InputSection::copyRelocations() may store artificial
|
|
|
|
// relocations aimed to update addends. They are handled in relocateAlloc()
|
|
|
|
// for allocatable sections, and this function does the same for
|
|
|
|
// non-allocatable sections, such as sections with debug information.
|
|
|
|
static void relocateNonAllocForRelocatable(InputSection *sec, uint8_t *buf) {
|
|
|
|
const unsigned bits = config->is64 ? 64 : 32;
|
|
|
|
|
|
|
|
for (const Relocation &rel : sec->relocations) {
|
|
|
|
// InputSection::copyRelocations() adds only R_ABS relocations.
|
|
|
|
assert(rel.expr == R_ABS);
|
2020-08-10 23:57:19 +08:00
|
|
|
uint8_t *bufLoc = buf + rel.offset;
|
2018-07-11 20:52:04 +08:00
|
|
|
uint64_t targetVA = SignExtend64(rel.sym->getVA(rel.addend), bits);
|
2020-01-23 13:39:16 +08:00
|
|
|
target->relocate(bufLoc, rel, targetVA);
|
2018-07-11 20:52:04 +08:00
|
|
|
}
|
|
|
|
}
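// For illustration (an assumed scenario): on a REL target such as ARM,
//   ld.lld -r a.o b.o -o ab.o
// keeps relocations in the output, but implicit addends live in the section
// contents, so combining input sections still requires rewriting those bytes.
// The loop above performs exactly those R_ABS addend updates for
// non-allocatable sections such as .debug_info.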
|
|
|
|
|
2016-03-19 02:11:26 +08:00
|
|
|
template <class ELFT>
|
2017-02-23 10:28:28 +08:00
|
|
|
void InputSectionBase::relocate(uint8_t *buf, uint8_t *bufEnd) {
|
2021-12-13 11:26:03 +08:00
|
|
|
if ((flags & SHF_EXECINSTR) && LLVM_UNLIKELY(getFile<ELFT>()->splitStack))
|
2018-07-18 07:16:02 +08:00
|
|
|
adjustSplitStackFunctionPrologues<ELFT>(buf, bufEnd);
|
|
|
|
|
2017-10-07 04:08:51 +08:00
|
|
|
if (flags & SHF_ALLOC) {
|
2017-05-19 00:45:36 +08:00
|
|
|
relocateAlloc(buf, bufEnd);
|
2017-10-07 04:08:51 +08:00
|
|
|
return;
|
|
|
|
}
|
2017-05-19 00:45:36 +08:00
|
|
|
|
2017-10-07 04:08:51 +08:00
|
|
|
auto *sec = cast<InputSection>(this);
|
2022-01-13 12:38:26 +08:00
|
|
|
if (config->relocatable)
|
2018-07-11 20:52:04 +08:00
|
|
|
relocateNonAllocForRelocatable(sec, buf);
|
2022-01-13 12:38:26 +08:00
|
|
|
// For a relocatable link, also call relocateNonAlloc() to rewrite applicable
|
|
|
|
// locations with tombstone values.
|
|
|
|
const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
|
|
|
|
if (rels.areRelocsRel())
|
|
|
|
sec->relocateNonAlloc<ELFT>(buf, rels.rels);
|
|
|
|
else
|
|
|
|
sec->relocateNonAlloc<ELFT>(buf, rels.relas);
|
2017-05-19 00:45:36 +08:00
|
|
|
}
|
2016-04-29 02:42:04 +08:00
|
|
|
|
2017-05-19 00:45:36 +08:00
|
|
|
void InputSectionBase::relocateAlloc(uint8_t *buf, uint8_t *bufEnd) {
|
|
|
|
assert(flags & SHF_ALLOC);
|
|
|
|
const unsigned bits = config->wordsize * 8;
|
2021-12-25 09:54:12 +08:00
|
|
|
const TargetInfo &target = *elf::target;
|
2020-08-17 22:30:14 +08:00
|
|
|
uint64_t lastPPCRelaxedRelocOff = UINT64_C(-1);
|
2022-01-10 13:20:37 +08:00
|
|
|
AArch64Relaxer aarch64relaxer(relocations);
|
|
|
|
for (size_t i = 0, size = relocations.size(); i != size; ++i) {
|
|
|
|
const Relocation &rel = relocations[i];
|
2020-04-07 21:48:18 +08:00
|
|
|
if (rel.expr == R_NONE)
|
|
|
|
continue;
|
2018-04-19 11:51:26 +08:00
|
|
|
uint64_t offset = rel.offset;
|
2018-04-19 10:24:28 +08:00
|
|
|
uint8_t *bufLoc = buf + offset;
|
2016-04-13 09:40:19 +08:00
|
|
|
|
2022-01-10 13:20:37 +08:00
|
|
|
uint64_t secAddr = getOutputSection()->addr;
|
2020-08-10 23:57:19 +08:00
|
|
|
if (auto *sec = dyn_cast<InputSection>(this))
|
2022-01-10 13:20:37 +08:00
|
|
|
secAddr += sec->outSecOff;
|
|
|
|
const uint64_t addrLoc = secAddr + offset;
|
2021-12-13 11:31:30 +08:00
|
|
|
const uint64_t targetVA =
|
|
|
|
SignExtend64(getRelocTargetVA(file, rel.type, rel.addend, addrLoc,
|
2022-01-10 13:20:37 +08:00
|
|
|
*rel.sym, rel.expr),
|
|
|
|
bits);
|
2021-12-13 11:31:30 +08:00
|
|
|
switch (rel.expr) {
|
2016-05-25 22:31:37 +08:00
|
|
|
case R_RELAX_GOT_PC:
|
[ELF] - Implemented support for test/binop relaxations from latest ABI.
Patch implements next relaxation from latest ABI:
"Convert memory operand of test and binop into immediate operand, where binop is one of adc, add, and, cmp, or,
sbb, sub, xor instructions, when position-independent code is disabled."
It is described in System V Application Binary Interface AMD64 Architecture Processor
Supplement Draft Version 0.99.8 (https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-r249.pdf,
B.2 "B.2 Optimize GOTPCRELX Relocations").
Differential revision: http://reviews.llvm.org/D20793
llvm-svn: 271405
2016-06-02 00:45:30 +08:00
|
|
|
case R_RELAX_GOT_PC_NOPIC:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxGot(bufLoc, rel, targetVA);
|
[PPC64] toc-indirect to toc-relative relaxation
This is based on D54720 by Sean Fertile.
When accessing a global symbol which is not defined in the translation unit,
compilers will generate instructions that load the address from the toc entry.
If the symbol is defined, non-preemptable, and addressable with a 32-bit
signed offset from the toc pointer, the address can be computed
directly. e.g.
addis 3, 2, .LC0@toc@ha # R_PPC64_TOC16_HA
ld 3, .LC0@toc@l(3) # R_PPC64_TOC16_LO_DS, load the address from a .toc entry
ld/lwa 3, 0(3) # load the value from the address
.section .toc,"aw",@progbits
.LC0: .tc var[TC],var
can be relaxed to
addis 3,2,var@toc@ha # this may be relaxed to a nop,
addi 3,3,var@toc@l # then this becomes addi 3,2,var@toc
ld/lwa 3, 0(3) # load the value from the address
We can delete the test ppc64-got-indirect.s as its purpose is covered by
newly added ppc64-toc-relax.s and ppc64-toc-relax-constants.s
Reviewed By: ruiu, sfertile
Differential Revision: https://reviews.llvm.org/D60958
llvm-svn: 360112
2019-05-07 12:26:05 +08:00
|
|
|
break;
|
2022-01-10 13:20:37 +08:00
|
|
|
case R_AARCH64_GOT_PAGE_PC:
|
|
|
|
if (i + 1 < size && aarch64relaxer.tryRelaxAdrpLdr(
|
|
|
|
rel, relocations[i + 1], secAddr, buf)) {
|
|
|
|
++i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
target.relocate(bufLoc, rel, targetVA);
|
|
|
|
break;
|
2022-02-02 14:08:05 +08:00
|
|
|
case R_AARCH64_PAGE_PC:
|
|
|
|
if (i + 1 < size && aarch64relaxer.tryRelaxAdrpAdd(
|
|
|
|
rel, relocations[i + 1], secAddr, buf)) {
|
|
|
|
++i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
target.relocate(bufLoc, rel, targetVA);
|
|
|
|
break;
|
2020-08-17 22:30:14 +08:00
|
|
|
case R_PPC64_RELAX_GOT_PC: {
|
|
|
|
// The R_PPC64_PCREL_OPT relocation must appear immediately after
|
|
|
|
// R_PPC64_GOT_PCREL34 in the relocations table at the same offset.
|
|
|
|
// We can only relax R_PPC64_PCREL_OPT if we have also relaxed
|
|
|
|
// the associated R_PPC64_GOT_PCREL34 since only the latter has an
|
|
|
|
// associated symbol. So save the offset when relaxing R_PPC64_GOT_PCREL34
|
|
|
|
// and only relax the other if the saved offset matches.
|
2021-12-13 11:31:30 +08:00
|
|
|
if (rel.type == R_PPC64_GOT_PCREL34)
|
2020-08-17 22:30:14 +08:00
|
|
|
lastPPCRelaxedRelocOff = offset;
|
2021-12-13 11:31:30 +08:00
|
|
|
if (rel.type == R_PPC64_PCREL_OPT && offset != lastPPCRelaxedRelocOff)
|
2020-08-17 22:30:14 +08:00
|
|
|
break;
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxGot(bufLoc, rel, targetVA);
|
2020-08-17 22:30:14 +08:00
|
|
|
break;
|
|
|
|
}
|
2020-04-29 21:27:54 +08:00
|
|
|
case R_PPC64_RELAX_TOC:
|
[ELF][PPC64] Suppress toc-indirect to toc-relative relaxation if R_PPC64_TOC16_LO is seen
The current implementation assumes that R_PPC64_TOC16_HA is always followed
by R_PPC64_TOC16_LO_DS. This can break with R_PPC64_TOC16_LO:
// Load the address of the TOC entry, instead of the value stored at that address
addis 3, 2, .LC0@tloc@ha # R_PPC64_TOC16_HA
addi 3, 3, .LC0@tloc@l # R_PPC64_TOC16_LO
blr
which is used by boringssl's util/fipstools/delocate/delocate.go
https://github.com/google/boringssl/blob/master/crypto/fipsmodule/FIPS.md has some documentation.
In short, this tool converts an assembly file to avoid any potential relocations.
The distance to an input .toc is not a constant after linking, so it cannot use an `addis;ld` pair.
Instead, it jumps to a stub which loads the TOC entry address with `addis;addi`.
This patch checks the presence of R_PPC64_TOC16_LO and suppresses
toc-indirect to toc-relative relaxation if R_PPC64_TOC16_LO is seen.
This approach is conservative and loses some relaxation opportunities but is easy to implement.
addis 3, 2, .LC0@toc@ha # no relaxation
addi 3, 3, .LC0@toc@l # no relaxation
li 9, 0
addis 4, 2, .LC0@toc@ha # can relax but suppressed
ld 4, .LC0@toc@l(4) # can relax but suppressed
Also note that interleaved R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS is
possible and this patch accounts for that.
addis 3, 2, .LC1@toc@ha # can relax
addis 4, 2, .LC2@toc@ha # can relax
ld 3, .LC1@toc@l(3) # can relax
ld 4, .LC2@toc@l(4) # can relax
Reviewed By: #powerpc, sfertile
Differential Revision: https://reviews.llvm.org/D78431
2020-04-18 14:08:11 +08:00
|
|
|
// rel.sym refers to the STT_SECTION symbol associated to the .toc input
|
|
|
|
// section. If an R_PPC64_TOC16_LO (.toc + addend) references the TOC
|
|
|
|
// entry, there may be R_PPC64_TOC16_HA not paired with
|
|
|
|
// R_PPC64_TOC16_LO_DS. Don't relax. This loses some relaxation
|
|
|
|
// opportunities but is safe.
|
|
|
|
if (ppc64noTocRelax.count({rel.sym, rel.addend}) ||
|
|
|
|
!tryRelaxPPC64TocIndirection(rel, bufLoc))
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relocate(bufLoc, rel, targetVA);
|
2016-05-25 22:31:37 +08:00
|
|
|
break;
|
2016-05-21 05:14:06 +08:00
|
|
|
case R_RELAX_TLS_IE_TO_LE:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxTlsIeToLe(bufLoc, rel, targetVA);
|
2016-05-21 05:14:06 +08:00
|
|
|
break;
|
|
|
|
case R_RELAX_TLS_LD_TO_LE:
|
2018-07-10 00:35:51 +08:00
|
|
|
case R_RELAX_TLS_LD_TO_LE_ABS:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxTlsLdToLe(bufLoc, rel, targetVA);
|
2016-05-21 05:14:06 +08:00
|
|
|
break;
|
|
|
|
case R_RELAX_TLS_GD_TO_LE:
|
2016-06-05 07:22:34 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_LE_NEG:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxTlsGdToLe(bufLoc, rel, targetVA);
|
2016-05-21 05:14:06 +08:00
|
|
|
break;
|
2018-11-13 18:16:36 +08:00
|
|
|
case R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC:
|
2016-05-21 05:14:06 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE:
|
2016-06-05 07:33:31 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE_ABS:
|
[PPC64] Thread-local storage general-dynamic to initial-exec relaxation.
Patch adds support for relaxing the general-dynamic tls sequence to
initial-exec.
the relaxation performs the following transformation:
addis r3, r2, x@got@tlsgd@ha --> addis r3, r2, x@got@tprel@ha
addi r3, r3, x@got@tlsgd@l --> ld r3, x@got@tprel@l(r3)
bl __tls_get_addr(x@tlsgd) --> nop
nop --> add r3, r3, r13
and instead of emitting a DTPMOD64/DTPREL64 pair for x, we emit a single
R_PPC64_TPREL64.
Differential Revision: https://reviews.llvm.org/D48090
llvm-svn: 335651
2018-06-27 03:38:18 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE_GOT_OFF:
|
2019-04-22 10:48:37 +08:00
|
|
|
case R_RELAX_TLS_GD_TO_IE_GOTPLT:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relaxTlsGdToIe(bufLoc, rel, targetVA);
|
2016-05-21 05:14:06 +08:00
|
|
|
break;
|
2019-06-03 14:21:33 +08:00
|
|
|
case R_PPC64_CALL:
|
2018-06-27 03:38:18 +08:00
|
|
|
// If this is a call to __tls_get_addr, it may be part of a TLS
|
|
|
|
// sequence that has been relaxed and turned into a nop. In this
|
|
|
|
// case, we don't want to handle it as a call.
|
|
|
|
if (read32(bufLoc) == 0x60000000) // nop
|
|
|
|
break;
|
|
|
|
|
2016-05-24 20:17:11 +08:00
|
|
|
// Patch a nop (0x60000000) to a ld.
|
2018-05-07 03:13:29 +08:00
|
|
|
if (rel.sym->needsTocRestore) {
|
2019-12-18 08:45:04 +08:00
|
|
|
// gcc/gfortran 5.4, 6.3 and earlier versions do not add a nop for
|
|
|
|
// recursive calls even if the function is preemptible. This is not
|
|
|
|
// wrong in the common case where the function is not preempted at
|
|
|
|
// runtime. Just ignore.
|
|
|
|
if ((bufLoc + 8 > bufEnd || read32(bufLoc + 4) != 0x60000000) &&
|
|
|
|
rel.sym->file != file) {
|
|
|
|
// Use substr(6) to remove the "__plt_" prefix.
|
|
|
|
errorOrWarn(getErrorLocation(bufLoc) + "call to " +
|
|
|
|
lld::toString(*rel.sym).substr(6) +
|
|
|
|
" lacks nop, can't restore toc");
|
2018-05-07 03:13:29 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
write32(bufLoc + 4, 0xe8410018); // ld %r2, 24(%r1)
|
2018-04-23 23:01:24 +08:00
|
|
|
}
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relocate(bufLoc, rel, targetVA);
|
2018-04-24 04:34:35 +08:00
|
|
|
break;
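// Illustrative example (assumed code sequence): a call that may cross TOC
// boundaries is compiled as
//   bl callee        # R_PPC64_REL24
//   nop              # 0x60000000, reserved for the linker
// and, when the call is routed through a PLT call stub, the nop is patched
// above so that the caller's TOC pointer is reloaded after the call returns:
//   bl __plt_callee
//   ld 2, 24(1)      # 0xe8410018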
|
2016-05-21 05:14:06 +08:00
|
|
|
default:
|
2021-12-25 09:54:12 +08:00
|
|
|
target.relocate(bufLoc, rel, targetVA);
|
2016-05-21 05:14:06 +08:00
|
|
|
break;
|
|
|
|
}
|
2015-09-22 06:01:00 +08:00
|
|
|
}
|
2020-04-07 21:48:18 +08:00
|
|
|
|
|
|
|
// Apply jumpInstrMods. jumpInstrMods are created when the opcode of
|
|
|
|
// a jmp insn must be modified to shrink the jmp insn or to flip the jmp
|
|
|
|
// insn. This is primarily used to relax and optimize jumps created with
|
|
|
|
// basic block sections.
|
2021-12-27 14:17:30 +08:00
|
|
|
if (jumpInstrMod) {
|
|
|
|
target.applyJumpInstrMod(buf + jumpInstrMod->offset, jumpInstrMod->original,
|
|
|
|
jumpInstrMod->size);
|
2020-04-07 21:48:18 +08:00
|
|
|
}
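// Illustrative example (assumed x86-64 encodings): with basic block sections,
// a block whose branch target becomes its fall-through block can have its
//   jmp target       # e9 rel32
// removed or shrunk, and a conditional branch may be flipped, e.g.
//   je  target       # 0f 84 rel32  ->  jne other_target
// jumpInstrMod records which bytes to rewrite for such a change.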
|
2015-09-22 06:01:00 +08:00
|
|
|
}
|
|
|
|
|
2018-07-18 07:16:02 +08:00
|
|
|
// For each function-defining prologue, find any calls to __morestack,
|
|
|
|
// and replace them with calls to __morestack_non_split.
|
|
|
|
static void switchMorestackCallsToMorestackNonSplit(
|
2022-02-01 16:09:30 +08:00
|
|
|
DenseSet<Defined *> &prologues,
|
|
|
|
SmallVector<Relocation *, 0> &morestackCalls) {
|
2018-07-18 07:16:02 +08:00
|
|
|
|
|
|
|
// If the target adjusted a function's prologue, all calls to
|
|
|
|
// __morestack inside that function should be switched to
|
|
|
|
// __morestack_non_split.
|
|
|
|
Symbol *moreStackNonSplit = symtab->find("__morestack_non_split");
|
2018-08-03 02:13:40 +08:00
|
|
|
if (!moreStackNonSplit) {
|
2022-01-16 02:46:25 +08:00
|
|
|
error("mixing split-stack objects requires a definition of "
|
2018-08-03 02:13:40 +08:00
|
|
|
"__morestack_non_split");
|
|
|
|
return;
|
|
|
|
}
|
2018-07-18 07:16:02 +08:00
|
|
|
|
|
|
|
// Sort both collections to compare addresses efficiently.
|
2018-09-27 04:54:42 +08:00
|
|
|
llvm::sort(morestackCalls, [](const Relocation *l, const Relocation *r) {
|
|
|
|
return l->offset < r->offset;
|
|
|
|
});
|
2018-07-18 07:16:02 +08:00
|
|
|
std::vector<Defined *> functions(prologues.begin(), prologues.end());
|
2018-09-27 04:54:42 +08:00
|
|
|
llvm::sort(functions, [](const Defined *l, const Defined *r) {
|
|
|
|
return l->value < r->value;
|
|
|
|
});
|
2018-07-18 07:16:02 +08:00
|
|
|
|
|
|
|
auto it = morestackCalls.begin();
|
|
|
|
for (Defined *f : functions) {
|
|
|
|
// Find the first call to __morestack within the function.
|
|
|
|
while (it != morestackCalls.end() && (*it)->offset < f->value)
|
|
|
|
++it;
|
|
|
|
// Adjust all calls inside the function.
|
|
|
|
while (it != morestackCalls.end() && (*it)->offset < f->value + f->size) {
|
|
|
|
(*it)->sym = moreStackNonSplit;
|
|
|
|
++it;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-03 02:13:40 +08:00
|
|
|
static bool enclosingPrologueAttempted(uint64_t offset,
|
|
|
|
const DenseSet<Defined *> &prologues) {
|
2018-07-18 07:16:02 +08:00
|
|
|
for (Defined *f : prologues)
|
|
|
|
if (f->value <= offset && offset < f->value + f->size)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If a function compiled for split stack calls a function not
|
|
|
|
// compiled for split stack, then the caller needs its prologue
|
|
|
|
// adjusted to ensure that the called function will have enough stack
|
|
|
|
// available. Find those functions, and adjust their prologues.
|
|
|
|
template <class ELFT>
|
|
|
|
void InputSectionBase::adjustSplitStackFunctionPrologues(uint8_t *buf,
|
|
|
|
uint8_t *end) {
|
2018-08-03 02:13:40 +08:00
|
|
|
DenseSet<Defined *> prologues;
|
2022-02-01 16:09:30 +08:00
|
|
|
SmallVector<Relocation *, 0> morestackCalls;
|
2018-07-18 07:16:02 +08:00
|
|
|
|
|
|
|
for (Relocation &rel : relocations) {
|
|
|
|
// Ignore calls into the split-stack api.
|
2018-08-14 06:29:15 +08:00
|
|
|
if (rel.sym->getName().startswith("__morestack")) {
|
|
|
|
if (rel.sym->getName().equals("__morestack"))
|
2018-07-18 07:16:02 +08:00
|
|
|
morestackCalls.push_back(&rel);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// A relocation to a non-function symbol isn't relevant. Sometimes
|
|
|
|
// __morestack is not marked as a function, so this check comes
|
|
|
|
// after the name check.
|
2018-08-14 06:29:15 +08:00
|
|
|
if (rel.sym->type != STT_FUNC)
|
2018-07-18 07:16:02 +08:00
|
|
|
continue;
|
|
|
|
|
2018-08-14 06:29:15 +08:00
|
|
|
// If the callee's file was compiled with split stack, there is nothing to do. In
|
|
|
|
// this context, a "Defined" symbol is one "defined by the binary currently
|
|
|
|
// being produced". So an "undefined" symbol might be provided by a shared
|
|
|
|
// library. It is not possible to tell how such symbols were compiled, so be
|
|
|
|
// conservative.
|
|
|
|
if (Defined *d = dyn_cast<Defined>(rel.sym))
|
|
|
|
if (InputSection *isec = cast_or_null<InputSection>(d->section))
|
2018-09-05 05:06:59 +08:00
|
|
|
if (!isec || !isec->getFile<ELFT>() || isec->getFile<ELFT>()->splitStack)
|
2018-08-14 06:29:15 +08:00
|
|
|
continue;
|
2018-08-03 02:13:40 +08:00
|
|
|
|
|
|
|
if (enclosingPrologueAttempted(rel.offset, prologues))
|
2018-07-18 07:16:02 +08:00
|
|
|
continue;
|
|
|
|
|
2021-12-27 07:21:22 +08:00
|
|
|
if (Defined *f = getEnclosingFunction(rel.offset)) {
|
2018-08-03 02:13:40 +08:00
|
|
|
prologues.insert(f);
|
2020-08-10 23:57:19 +08:00
|
|
|
if (target->adjustPrologueForCrossSplitStack(buf + f->value, end,
|
|
|
|
f->stOther))
|
2018-07-18 07:16:02 +08:00
|
|
|
continue;
|
2018-08-03 02:13:40 +08:00
|
|
|
if (!getFile<ELFT>()->someNoSplitStack)
|
2020-05-15 13:18:58 +08:00
|
|
|
error(lld::toString(this) + ": " + f->getName() +
|
2018-08-14 06:29:15 +08:00
|
|
|
" (with -fsplit-stack) calls " + rel.sym->getName() +
|
2018-08-03 02:13:40 +08:00
|
|
|
" (without -fsplit-stack), but couldn't adjust its prologue");
|
2018-07-18 07:16:02 +08:00
|
|
|
}
|
|
|
|
}
|
2018-10-17 01:13:01 +08:00
|
|
|
|
|
|
|
if (target->needsMoreStackNonSplit)
|
|
|
|
switchMorestackCallsToMorestackNonSplit(prologues, morestackCalls);
|
2018-07-18 07:16:02 +08:00
|
|
|
}
|
|
|
|
|
2017-02-24 00:49:07 +08:00
|
|
|
template <class ELFT> void InputSection::writeTo(uint8_t *buf) {
|
2017-02-27 10:56:02 +08:00
|
|
|
if (auto *s = dyn_cast<SyntheticSection>(this)) {
|
2021-12-27 04:11:40 +08:00
|
|
|
s->writeTo(buf);
|
2016-11-16 18:02:27 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-12-17 15:42:50 +08:00
|
|
|
if (LLVM_UNLIKELY(type == SHT_NOBITS))
|
|
|
|
return;
|
2017-02-09 00:18:10 +08:00
|
|
|
// If -r or --emit-relocs is given, then an InputSection
|
|
|
|
// may be a relocation section.
|
2021-12-17 15:42:50 +08:00
|
|
|
if (LLVM_UNLIKELY(type == SHT_RELA)) {
|
2021-12-27 04:11:40 +08:00
|
|
|
copyRelocations<ELFT>(buf, getDataAs<typename ELFT::Rela>());
|
2016-02-25 16:23:37 +08:00
|
|
|
return;
|
|
|
|
}
|
2021-12-17 15:42:50 +08:00
|
|
|
if (LLVM_UNLIKELY(type == SHT_REL)) {
|
2021-12-27 04:11:40 +08:00
|
|
|
copyRelocations<ELFT>(buf, getDataAs<typename ELFT::Rel>());
|
2016-02-25 16:23:37 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-06-09 11:19:08 +08:00
|
|
|
// If -r is given, we may have a SHT_GROUP section.
|
2021-12-17 15:42:50 +08:00
|
|
|
if (LLVM_UNLIKELY(type == SHT_GROUP)) {
|
2021-12-27 04:11:40 +08:00
|
|
|
copyShtGroup<ELFT>(buf);
|
2017-05-29 16:37:50 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Avoid unnecessary buffer allocation and memcpy for compressed sections.
Previously, we uncompress all compressed sections before doing anything.
That works, and that is conceptually simple, but it could result in
a waste of CPU time and memory if uncompressed sections are then
discarded or just copied to the output buffer.
In particular, if .debug_gnu_pub{names,types} are compressed and if no
-gdb-index option is given, we wasted CPU and memory because we
uncompress them into newly allocated buffers and then memcpy the buffers
to the output buffer. That temporary buffer was redundant.
This patch changes how to uncompress sections. Now, compressed sections
are uncompressed lazily. To do that, the `Data` member of `InputSectionBase`
is now hidden from outside, and the `data()` accessor automatically expands
a compressed buffer if necessary.
If no one calls `data()`, then `writeTo()` directly uncompresses
compressed data into the output buffer. That eliminates the redundant
memory allocation and redundant memcpy.
This patch significantly reduces memory consumption (20 GiB max RSS to
15 GiB) for an executable whose .debug_gnu_pub{names,types} are in total
5 GiB in an uncompressed form.
Differential Revision: https://reviews.llvm.org/D52917
llvm-svn: 343979
2018-10-09 00:58:59 +08:00
|
|
|
// If this is a compressed section, uncompress section contents directly
|
|
|
|
// to the buffer.
|
2019-03-13 04:32:30 +08:00
|
|
|
if (uncompressedSize >= 0) {
|
2018-10-09 00:58:59 +08:00
|
|
|
size_t size = uncompressedSize;
|
2021-12-27 04:11:40 +08:00
|
|
|
if (Error e = zlib::uncompress(toStringRef(rawData), (char *)buf, size))
|
2018-10-09 00:58:59 +08:00
|
|
|
fatal(toString(this) +
|
|
|
|
": uncompress failed: " + llvm::toString(std::move(e)));
|
2021-12-27 04:11:40 +08:00
|
|
|
uint8_t *bufEnd = buf + size;
|
|
|
|
relocate<ELFT>(buf, bufEnd);
|
2018-10-09 00:58:59 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-04-14 09:35:04 +08:00
|
|
|
// Copy section contents from source object file to output file
|
|
|
|
// and then apply relocations.
|
2022-01-27 14:03:26 +08:00
|
|
|
memcpy(buf, rawData.data(), rawData.size());
|
|
|
|
relocate<ELFT>(buf, buf + rawData.size());
|
2015-07-25 05:03:07 +08:00
|
|
|
}
|
|
|
|
|
2017-02-24 00:49:07 +08:00
|
|
|
void InputSection::replace(InputSection *other) {
|
2017-12-20 00:29:02 +08:00
|
|
|
alignment = std::max(alignment, other->alignment);
|
2019-05-29 11:55:20 +08:00
|
|
|
|
|
|
|
// When a section is replaced with another section that was allocated to
|
|
|
|
// another partition, the replacement section (and its associated sections)
|
|
|
|
// need to be placed in the main partition so that both partitions will be
|
|
|
|
// able to access it.
|
|
|
|
if (partition != other->partition) {
|
|
|
|
partition = 1;
|
|
|
|
for (InputSection *isec : dependentSections)
|
|
|
|
isec->partition = 1;
|
|
|
|
}
|
|
|
|
|
2017-12-20 00:29:02 +08:00
|
|
|
other->repl = repl;
|
2019-05-29 11:55:20 +08:00
|
|
|
other->markDead();
|
2016-02-26 02:43:51 +08:00
|
|
|
}
|
|
|
|
|
2015-11-12 03:54:14 +08:00
|
|
|
template <class ELFT>
|
2017-12-21 10:03:39 +08:00
|
|
|
EhInputSection::EhInputSection(ObjFile<ELFT> &f,
|
|
|
|
const typename ELFT::Shdr &header,
|
2017-03-07 05:17:18 +08:00
|
|
|
StringRef name)
|
2017-11-30 22:01:06 +08:00
|
|
|
: InputSectionBase(f, header, name, InputSectionBase::EHFrame) {}
|
2015-11-12 03:54:14 +08:00
|
|
|
|
2017-06-01 04:17:44 +08:00
|
|
|
SyntheticSection *EhInputSection::getParent() const {
|
|
|
|
return cast_or_null<SyntheticSection>(parent);
|
|
|
|
}
|
|
|
|
|
2016-07-22 04:18:30 +08:00
|
|
|
// Returns the index of the first relocation that points to a region between
|
|
|
|
// begin and begin+size.
|
|
|
|
template <class IntTy, class RelTy>
|
|
|
|
static unsigned getReloc(IntTy begin, IntTy size, const ArrayRef<RelTy> &rels,
|
|
|
|
unsigned &relocI) {
|
|
|
|
// Start the search from relocI for fast access. That works because the
|
|
|
|
// relocations are sorted in .eh_frame.
|
|
|
|
for (unsigned n = rels.size(); relocI < n; ++relocI) {
|
|
|
|
const RelTy &rel = rels[relocI];
|
|
|
|
if (rel.r_offset < begin)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (rel.r_offset < begin + size)
|
|
|
|
return relocI;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2016-05-23 07:53:00 +08:00
|
|
|
// .eh_frame is a sequence of CIE or FDE records.
|
|
|
|
// This function splits an input section into records and returns them.
|
2017-03-07 05:17:18 +08:00
|
|
|
template <class ELFT> void EhInputSection::split() {
|
2021-10-28 00:51:06 +08:00
|
|
|
const RelsOrRelas<ELFT> rels = relsOrRelas<ELFT>();
|
2022-01-19 09:03:23 +08:00
|
|
|
// getReloc expects the relocations to be sorted by r_offset. See the comment
|
|
|
|
// in scanRelocs.
|
|
|
|
if (rels.areRelocsRel()) {
|
|
|
|
SmallVector<typename ELFT::Rel, 0> storage;
|
|
|
|
split<ELFT>(sortRels(rels.rels, storage));
|
|
|
|
} else {
|
|
|
|
SmallVector<typename ELFT::Rela, 0> storage;
|
|
|
|
split<ELFT>(sortRels(rels.relas, storage));
|
|
|
|
}
|
2016-07-22 04:18:30 +08:00
|
|
|
}
|
|
|
|
|
2017-03-07 05:17:18 +08:00
|
|
|
template <class ELFT, class RelTy>
|
|
|
|
void EhInputSection::split(ArrayRef<RelTy> rels) {
|
2022-01-19 09:03:23 +08:00
|
|
|
ArrayRef<uint8_t> d = rawData;
|
|
|
|
const char *msg = nullptr;
|
2016-07-22 04:18:30 +08:00
|
|
|
unsigned relI = 0;
|
2022-01-19 09:03:23 +08:00
|
|
|
while (!d.empty()) {
|
|
|
|
if (d.size() < 4) {
|
|
|
|
msg = "CIE/FDE too small";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
uint64_t size = endian::read32<ELFT::TargetEndianness>(d.data());
|
|
|
|
// If it is 0xFFFFFFFF, the next 8 bytes contain the size instead,
|
|
|
|
// but we do not support that format yet.
|
|
|
|
if (size == UINT32_MAX) {
|
|
|
|
msg = "CIE/FDE too large";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
size += 4;
|
|
|
|
if (size > d.size()) {
|
|
|
|
msg = "CIE/FDE ends past the end of the section";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t off = d.data() - rawData.data();
|
2017-12-20 00:29:02 +08:00
|
|
|
pieces.emplace_back(off, this, size, getReloc(off, size, rels, relI));
|
2022-01-19 09:03:23 +08:00
|
|
|
d = d.slice(size);
|
2016-05-23 07:53:00 +08:00
|
|
|
}
|
2022-01-19 09:03:23 +08:00
|
|
|
if (msg)
|
|
|
|
errorOrWarn("corrupted .eh_frame: " + Twine(msg) + "\n>>> defined in " +
|
|
|
|
getObjMsg(d.data() - rawData.data()));
|
2016-05-23 07:53:00 +08:00
|
|
|
}
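To make the length decoding above concrete, here is a small, hypothetical
helper that mirrors the size computation; the byte values in the example
comment are made up, and this is not code taken from lld.

#include <cstdint>
#include <cstring>

// Returns the total number of bytes occupied by the CIE/FDE record starting
// at `p`, or 0 if the record uses the extended 0xFFFFFFFF (8-byte) length
// format that the splitter above rejects. For brevity this assumes a
// little-endian host and target; the real code uses endian::read32.
static uint64_t recordSize(const uint8_t *p) {
  uint32_t len;
  std::memcpy(&len, p, 4);
  if (len == UINT32_MAX)
    return 0; // corresponds to "CIE/FDE too large" above
  return uint64_t(len) + 4; // the length field does not count itself
}

// Example: a record beginning with bytes 14 00 00 00 has len == 0x14 == 20,
// so the whole record spans 20 + 4 = 24 bytes, matching `size += 4` above.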
|
|
|
|
|
2017-10-22 09:58:30 +08:00
|
|
|
static size_t findNull(StringRef s, size_t entSize) {
|
2016-04-23 06:09:35 +08:00
|
|
|
for (unsigned i = 0, n = s.size(); i != n; i += entSize) {
|
|
|
|
const char *b = s.begin() + i;
|
|
|
|
if (std::all_of(b, b + entSize, [](char c) { return c == 0; }))
|
|
|
|
return i;
|
|
|
|
}
|
2022-01-31 09:15:45 +08:00
|
|
|
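// splitStrings verifies that the section's trailing entry is all zeros before
// calling this function, so the loop above always finds a null entry.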
llvm_unreachable("");
|
2016-04-23 06:09:35 +08:00
|
|
|
}
|
|
|
|
|
2017-06-01 04:17:44 +08:00
|
|
|
SyntheticSection *MergeInputSection::getParent() const {
|
|
|
|
return cast_or_null<SyntheticSection>(parent);
|
|
|
|
}
|
|
|
|
|
2016-05-23 08:40:24 +08:00
|
|
|
// Split a SHF_STRINGS section. Such a section is a sequence of
|
|
|
|
// null-terminated strings.
|
2022-01-31 09:15:45 +08:00
|
|
|
void MergeInputSection::splitStrings(StringRef s, size_t entSize) {
|
2021-12-17 13:17:02 +08:00
|
|
|
const bool live = !(flags & SHF_ALLOC) || !config->gcSections;
|
2022-01-31 09:15:45 +08:00
|
|
|
const char *p = s.data(), *end = s.data() + s.size();
|
|
|
|
if (!std::all_of(end - entSize, end, [](char c) { return c == 0; }))
|
|
|
|
fatal(toString(this) + ": string is not null terminated");
|
|
|
|
if (entSize == 1) {
|
|
|
|
// Optimize the common case.
|
|
|
|
do {
|
|
|
|
size_t size = strlen(p) + 1;
|
|
|
|
pieces.emplace_back(p - s.begin(), xxHash64(StringRef(p, size)), live);
|
|
|
|
p += size;
|
|
|
|
} while (p != end);
|
|
|
|
} else {
|
|
|
|
do {
|
|
|
|
size_t size = findNull(StringRef(p, end - p), entSize) + entSize;
|
|
|
|
pieces.emplace_back(p - s.begin(), xxHash64(StringRef(p, size)), live);
|
|
|
|
p += size;
|
|
|
|
} while (p != end);
|
2016-04-23 06:09:35 +08:00
|
|
|
}
|
2016-05-23 08:40:24 +08:00
|
|
|
}
|
2016-04-23 06:09:35 +08:00
|
|
|
|
2016-05-23 08:40:24 +08:00
|
|
|
// Split a non-SHF_STRINGS section. Such a section is a sequence of
|
|
|
|
// fixed-size records.
|
2017-03-07 04:23:56 +08:00
|
|
|
void MergeInputSection::splitNonStrings(ArrayRef<uint8_t> data,
|
|
|
|
size_t entSize) {
|
2016-04-23 06:09:35 +08:00
|
|
|
size_t size = data.size();
|
|
|
|
assert((size % entSize) == 0);
|
2021-12-17 13:17:02 +08:00
|
|
|
const bool live = !(flags & SHF_ALLOC) || !config->gcSections;
|
[Coding style change] Rename variables so that they start with a lowercase letter
This patch was mechanically generated by the clang-llvm-rename tool that I wrote
using the Clang Refactoring Engine just for creating this patch. You can see the
source code of the tool at https://reviews.llvm.org/D64123. There is no manual
post-processing; you can generate the same patch by re-running the tool against
lld's code base.
Here is the main discussion thread about changing the LLVM coding style:
https://lists.llvm.org/pipermail/llvm-dev/2019-February/130083.html
In that discussion thread, I proposed using lld as a testbed for the variable
naming scheme change, and this patch does that.
I chose to rename variables to camelCase because that is the minimal change
that makes variables start with a lowercase letter.
Note to downstream patch maintainers: if you are maintaining a downstream lld
repo, simply rebasing past this commit would cause massive merge conflicts
because this patch essentially changes every line in the lld subdirectory. But
there is a remedy.
clang-llvm-rename is a batch tool, so you can rename variables in your
downstream repo with it. Given that, here is how to rebase your repo to
a commit after the mass renaming:
1. rebase to the commit just before the mass variable renaming,
2. apply the tool to your downstream repo to mass-rename variables locally, and
3. rebase again to the head.
Most changes made by the tool should be identical for a downstream repo and
for the head, so at step 3 almost all changes should merge cleanly and
disappear. I would expect a few lines that you need to merge by hand, but
there should not be too many.
Differential Revision: https://reviews.llvm.org/D64121
llvm-svn: 365595
2019-07-10 13:00:37 +08:00
|
|
|
|
2022-01-30 16:10:52 +08:00
|
|
|
pieces.resize_for_overwrite(size / entSize);
|
2021-12-17 13:22:59 +08:00
|
|
|
for (size_t i = 0, j = 0; i != size; i += entSize, j++)
|
|
|
|
pieces[j] = {i, (uint32_t)xxHash64(data.slice(i, entSize)), live};
|
2016-05-23 08:40:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template <class ELFT>
|
2017-12-21 10:03:39 +08:00
|
|
|
MergeInputSection::MergeInputSection(ObjFile<ELFT> &f,
|
|
|
|
const typename ELFT::Shdr &header,
|
2017-03-07 04:23:56 +08:00
|
|
|
StringRef name)
|
2017-12-21 09:21:59 +08:00
|
|
|
: InputSectionBase(f, header, name, InputSectionBase::Merge) {}
|
|
|
|
|
|
|
|
MergeInputSection::MergeInputSection(uint64_t flags, uint32_t type,
|
|
|
|
uint64_t entsize, ArrayRef<uint8_t> data,
|
|
|
|
StringRef name)
|
|
|
|
: InputSectionBase(nullptr, flags, type, entsize, /*Link*/ 0, /*Info*/ 0,
|
|
|
|
/*Alignment*/ entsize, data, name, SectionBase::Merge) {}
|
2016-05-24 00:55:43 +08:00
|
|
|
|
Parallelize uncompress() and splitIntoPieces().
Uncompressing section contents and splitting mergeable section contents
into smaller chunks are heavy tasks. They scan entire section contents
and do CPU-intensive work such as uncompressing zlib-compressed data
or computing a hash value for each section piece.
Luckily, these tasks are independent of each other, so we can do them
in parallel_for_each. The number of input sections is large (as opposed
to the number of output sections), so there is a lot of parallelism here.
Actually, the current design of calling uncompress() and splitIntoPieces()
in batch was chosen with this in mind. Basically, all we need to do here
is to replace `for` with `parallel_for_each`.
This patch improves latency significantly if the linked program contains
debug info (which in turn contains lots of mergeable strings).
For example, the latency to link Clang (debug build) improved by 20% on
my machine, as shown below. Note that ld.gold took 19.2 seconds to do
the same thing.
Before:
30801.782712 task-clock (msec) # 3.652 CPUs utilized ( +- 2.59% )
104,084 context-switches # 0.003 M/sec ( +- 1.02% )
5,063 cpu-migrations # 0.164 K/sec ( +- 13.66% )
2,528,130 page-faults # 0.082 M/sec ( +- 0.47% )
85,317,809,130 cycles # 2.770 GHz ( +- 2.62% )
67,352,463,373 stalled-cycles-frontend # 78.94% frontend cycles idle ( +- 3.06% )
<not supported> stalled-cycles-backend
44,295,945,493 instructions # 0.52 insns per cycle
# 1.52 stalled cycles per insn ( +- 0.44% )
8,572,384,877 branches # 278.308 M/sec ( +- 0.66% )
141,806,726 branch-misses # 1.65% of all branches ( +- 0.13% )
8.433424003 seconds time elapsed ( +- 1.20% )
After:
35523.764575 task-clock (msec) # 5.265 CPUs utilized ( +- 2.67% )
159,107 context-switches # 0.004 M/sec ( +- 0.48% )
8,123 cpu-migrations # 0.229 K/sec ( +- 23.34% )
2,372,483 page-faults # 0.067 M/sec ( +- 0.36% )
98,395,342,152 cycles # 2.770 GHz ( +- 2.62% )
79,294,670,125 stalled-cycles-frontend # 80.59% frontend cycles idle ( +- 3.03% )
<not supported> stalled-cycles-backend
46,274,151,813 instructions # 0.47 insns per cycle
# 1.71 stalled cycles per insn ( +- 0.47% )
8,987,621,670 branches # 253.003 M/sec ( +- 0.60% )
148,900,624 branch-misses # 1.66% of all branches ( +- 0.27% )
6.747548004 seconds time elapsed ( +- 0.40% )
llvm-svn: 287946
2016-11-26 04:05:08 +08:00
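As a rough illustration of what replacing `for` with a parallel loop looks
like at the call site, here is a minimal sketch. It assumes a caller-owned
list of MergeInputSection pointers and uses llvm::parallelForEach; it is a
sketch, not the actual driver code.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Parallel.h"

// Sketch of the batch call site: split every mergeable section in parallel.
// splitIntoPieces() only writes to the section's own `pieces` vector, so the
// loop body satisfies the thread-safety requirement noted below.
static void splitAllMergeSections(llvm::ArrayRef<MergeInputSection *> sections) {
  llvm::parallelForEach(sections, [](MergeInputSection *sec) {
    sec->splitIntoPieces();
  });
}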
|
|
|
// This function is called after we obtain a complete list of input sections
|
|
|
|
// that need to be linked. This function is responsible for splitting section contents
|
|
|
|
// into small chunks for further processing.
|
|
|
|
//
|
2017-08-24 03:03:20 +08:00
|
|
|
// Note that this function is called from parallelForEach. This must be
|
2016-11-26 04:05:08 +08:00
|
|
|
// thread-safe (i.e. no memory allocation from the pools).
|
2017-03-07 04:23:56 +08:00
|
|
|
void MergeInputSection::splitIntoPieces() {
|
2017-09-01 20:04:52 +08:00
|
|
|
assert(pieces.empty());
|
2017-10-22 09:58:30 +08:00
|
|
|
|
2017-12-20 00:29:02 +08:00
|
|
|
if (flags & SHF_STRINGS)
|
2022-01-31 09:15:45 +08:00
|
|
|
splitStrings(toStringRef(data()), entsize);
|
2016-05-23 08:40:24 +08:00
|
|
|
else
|
2018-10-09 00:58:59 +08:00
|
|
|
splitNonStrings(data(), entsize);
|
2016-04-23 06:09:35 +08:00
|
|
|
}
|
2015-10-20 05:00:02 +08:00
|
|
|
|
2018-04-03 12:06:14 +08:00
|
|
|
SectionPiece *MergeInputSection::getSectionPiece(uint64_t offset) {
|
2018-10-09 00:58:59 +08:00
|
|
|
if (this->data().size() <= offset)
|
2018-08-31 19:51:51 +08:00
|
|
|
fatal(toString(this) + ": offset is outside the section");
|
|
|
|
|
2018-04-03 12:06:14 +08:00
|
|
|
// If Offset is not at the beginning of a section piece, it is not in the map.
|
2018-08-31 19:51:51 +08:00
|
|
|
// In that case we need to do a binary search of the original section piece vector.
|
2019-06-30 19:19:56 +08:00
|
|
|
auto it = partition_point(
|
|
|
|
pieces, [=](SectionPiece p) { return p.inputOff <= offset; });
|
2019-04-17 16:00:46 +08:00
|
|
|
return &it[-1];
|
2018-04-03 12:06:14 +08:00
|
|
|
}
|
|
|
|
|
Avoid doing binary search.
MergedInputSection::getOffset is the busiest function in LLD if string
merging is enabled and input files have lots of mergeable sections.
That is usually the case when creating an executable with debug info,
so it is pretty common.
The reason it is slow is that it has to do fairly complex
computations. For non-mergeable sections, section contents are
contiguous in the output, so in order to compute an output offset,
we only have to add the output section's base address to an input
offset. But for mergeable strings, section contents are split for
merging, so they are not contiguous. We have to do some lookups.
We used to do a binary search on the list of section pieces.
It is slow, I think, because it is hostile to branch prediction.
This patch replaces it with a hash table lookup. It seems to work
pretty well. Below is "perf stat -r10" output when linking clang
with debug info. In this case this patch speeds things up by about 4%.
Before:
6584.153205 task-clock (msec) # 1.001 CPUs utilized ( +- 0.09% )
238 context-switches # 0.036 K/sec ( +- 6.59% )
0 cpu-migrations # 0.000 K/sec ( +- 50.92% )
1,067,675 page-faults # 0.162 M/sec ( +- 0.15% )
18,369,931,470 cycles # 2.790 GHz ( +- 0.09% )
9,640,680,143 stalled-cycles-frontend # 52.48% frontend cycles idle ( +- 0.18% )
<not supported> stalled-cycles-backend
21,206,747,787 instructions # 1.15 insns per cycle
# 0.45 stalled cycles per insn ( +- 0.04% )
3,817,398,032 branches # 579.786 M/sec ( +- 0.04% )
132,787,249 branch-misses # 3.48% of all branches ( +- 0.02% )
6.579106511 seconds time elapsed ( +- 0.09% )
After:
6312.317533 task-clock (msec) # 1.001 CPUs utilized ( +- 0.19% )
221 context-switches # 0.035 K/sec ( +- 4.11% )
1 cpu-migrations # 0.000 K/sec ( +- 45.21% )
1,280,775 page-faults # 0.203 M/sec ( +- 0.37% )
17,611,539,150 cycles # 2.790 GHz ( +- 0.19% )
10,285,148,569 stalled-cycles-frontend # 58.40% frontend cycles idle ( +- 0.30% )
<not supported> stalled-cycles-backend
18,794,779,900 instructions # 1.07 insns per cycle
# 0.55 stalled cycles per insn ( +- 0.03% )
3,287,450,865 branches # 520.799 M/sec ( +- 0.03% )
72,259,605 branch-misses # 2.20% of all branches ( +- 0.01% )
6.307411828 seconds time elapsed ( +- 0.19% )
Differential Revision: http://reviews.llvm.org/D20645
llvm-svn: 270999
2016-05-27 22:39:13 +08:00
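A concrete (made-up) example of the mapping implemented by getSectionPiece
above and getParentOffset below: if a merge section was split into pieces
with inputOff {0, 8, 24} that were assigned outputOff {0x100, 0x108, 0x120},
then input offset 10 falls in the second piece (the last piece whose inputOff
is <= 10), so the result is 0x108 + (10 - 8) = 0x10A. The standalone sketch
below mirrors that lookup with std::partition_point over simplified stand-in
types; it is not lld's code.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct Piece {
  uint64_t inputOff;
  uint64_t outputOff;
};

// Pieces are sorted by inputOff, so the last piece with inputOff <= offset
// can be found with a partition point, exactly like getSectionPiece above.
static uint64_t mapOffset(const std::vector<Piece> &pieces, uint64_t offset) {
  auto it = std::partition_point(
      pieces.begin(), pieces.end(),
      [=](const Piece &p) { return p.inputOff <= offset; });
  const Piece &p = it[-1];
  return p.outputOff + (offset - p.inputOff);
}

int main() {
  std::vector<Piece> pieces = {{0, 0x100}, {8, 0x108}, {24, 0x120}};
  assert(mapOffset(pieces, 10) == 0x10A);
}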
|
|
|
// Returns the offset in an output section for a given input offset.
|
|
|
|
// Because the contents of a mergeable section are not contiguous in the output,
|
|
|
|
// it is not just an addition to a base output offset.
|
2018-04-20 00:05:07 +08:00
|
|
|
uint64_t MergeInputSection::getParentOffset(uint64_t offset) const {
|
2017-11-01 03:14:06 +08:00
|
|
|
// If Offset is not at the beginning of a section piece, it is not in the map.
|
|
|
|
// In that case we need to search the original section piece vector.
|
2021-03-01 08:42:49 +08:00
|
|
|
const SectionPiece &piece = *getSectionPiece(offset);
|
2017-03-07 04:23:56 +08:00
|
|
|
uint64_t addend = offset - piece.inputOff;
|
2016-05-29 02:40:38 +08:00
|
|
|
return piece.outputOff + addend;
|
2016-05-27 22:39:13 +08:00
|
|
|
}
|
|
|
|
|
2017-12-21 10:03:39 +08:00
|
|
|
template InputSection::InputSection(ObjFile<ELF32LE> &, const ELF32LE::Shdr &,
|
2017-07-27 06:13:32 +08:00
|
|
|
StringRef);
|
2017-12-21 10:03:39 +08:00
|
|
|
template InputSection::InputSection(ObjFile<ELF32BE> &, const ELF32BE::Shdr &,
|
2017-07-27 06:13:32 +08:00
|
|
|
StringRef);
|
2017-12-21 10:03:39 +08:00
|
|
|
template InputSection::InputSection(ObjFile<ELF64LE> &, const ELF64LE::Shdr &,
|
2017-07-27 06:13:32 +08:00
|
|
|
StringRef);
|
2017-12-21 10:03:39 +08:00
|
|
|
template InputSection::InputSection(ObjFile<ELF64BE> &, const ELF64BE::Shdr &,
|
2017-07-27 06:13:32 +08:00
|
|
|
StringRef);
|
2017-03-30 04:15:29 +08:00
|
|
|
|
|
|
|
template void InputSection::writeTo<ELF32LE>(uint8_t *);
|
|
|
|
template void InputSection::writeTo<ELF32BE>(uint8_t *);
|
|
|
|
template void InputSection::writeTo<ELF64LE>(uint8_t *);
|
|
|
|
template void InputSection::writeTo<ELF64BE>(uint8_t *);
|
2016-02-28 08:25:54 +08:00
|
|
|
|
2021-10-28 00:51:06 +08:00
|
|
|
template RelsOrRelas<ELF32LE> InputSectionBase::relsOrRelas<ELF32LE>() const;
|
|
|
|
template RelsOrRelas<ELF32BE> InputSectionBase::relsOrRelas<ELF32BE>() const;
|
|
|
|
template RelsOrRelas<ELF64LE> InputSectionBase::relsOrRelas<ELF64LE>() const;
|
|
|
|
template RelsOrRelas<ELF64BE> InputSectionBase::relsOrRelas<ELF64BE>() const;
|
|
|
|
|
2017-12-21 10:03:39 +08:00
|
|
|
template MergeInputSection::MergeInputSection(ObjFile<ELF32LE> &,
|
|
|
|
const ELF32LE::Shdr &, StringRef);
|
|
|
|
template MergeInputSection::MergeInputSection(ObjFile<ELF32BE> &,
|
|
|
|
const ELF32BE::Shdr &, StringRef);
|
|
|
|
template MergeInputSection::MergeInputSection(ObjFile<ELF64LE> &,
|
|
|
|
const ELF64LE::Shdr &, StringRef);
|
|
|
|
template MergeInputSection::MergeInputSection(ObjFile<ELF64BE> &,
|
|
|
|
const ELF64BE::Shdr &, StringRef);
|
|
|
|
|
|
|
|
template EhInputSection::EhInputSection(ObjFile<ELF32LE> &,
|
|
|
|
const ELF32LE::Shdr &, StringRef);
|
|
|
|
template EhInputSection::EhInputSection(ObjFile<ELF32BE> &,
|
|
|
|
const ELF32BE::Shdr &, StringRef);
|
|
|
|
template EhInputSection::EhInputSection(ObjFile<ELF64LE> &,
|
|
|
|
const ELF64LE::Shdr &, StringRef);
|
|
|
|
template EhInputSection::EhInputSection(ObjFile<ELF64BE> &,
|
|
|
|
const ELF64BE::Shdr &, StringRef);
|
2017-03-07 05:17:18 +08:00
|
|
|
|
|
|
|
template void EhInputSection::split<ELF32LE>();
|
|
|
|
template void EhInputSection::split<ELF32BE>();
|
|
|
|
template void EhInputSection::split<ELF64LE>();
|
|
|
|
template void EhInputSection::split<ELF64BE>();
|