2012-02-18 20:03:15 +08:00
|
|
|
//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2004-07-09 01:58:04 +08:00
|
|
|
// This file contains a printer that converts from our internal representation
|
|
|
|
// of machine-dependent LLVM code to PowerPC assembly language. This printer is
|
2004-07-29 04:18:53 +08:00
|
|
|
// the output mechanism used by `llc'.
|
2004-06-22 00:55:25 +08:00
|
|
|
//
|
2004-07-09 01:58:04 +08:00
|
|
|
// Documentation at http://developer.apple.com/documentation/DeveloperTools/
|
|
|
|
// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
|
2004-06-30 01:13:26 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2012-03-18 02:46:09 +08:00
|
|
|
#include "InstPrinter/PPCInstPrinter.h"
|
2013-05-24 06:26:41 +08:00
|
|
|
#include "MCTargetDesc/PPCMCExpr.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "MCTargetDesc/PPCMCTargetDesc.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "PPC.h"
|
|
|
|
#include "PPCInstrInfo.h"
|
2015-01-14 19:23:27 +08:00
|
|
|
#include "PPCMachineFunctionInfo.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "PPCSubtarget.h"
|
|
|
|
#include "PPCTargetMachine.h"
|
2013-10-08 21:08:17 +08:00
|
|
|
#include "PPCTargetStreamer.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/ADT/Triple.h"
|
|
|
|
#include "llvm/ADT/Twine.h"
|
2017-06-07 11:48:56 +08:00
|
|
|
#include "llvm/BinaryFormat/ELF.h"
|
|
|
|
#include "llvm/BinaryFormat/MachO.h"
|
2004-08-17 07:25:21 +08:00
|
|
|
#include "llvm/CodeGen/AsmPrinter.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
2004-06-22 00:55:25 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2010-01-21 05:16:14 +08:00
|
|
|
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
#include "llvm/CodeGen/StackMaps.h"
|
2010-02-16 06:37:53 +08:00
|
|
|
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/IR/DataLayout.h"
|
|
|
|
#include "llvm/IR/GlobalValue.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Module.h"
|
2009-09-14 01:14:04 +08:00
|
|
|
#include "llvm/MC/MCAsmInfo.h"
|
2010-01-14 03:00:57 +08:00
|
|
|
#include "llvm/MC/MCContext.h"
|
2010-03-12 07:39:44 +08:00
|
|
|
#include "llvm/MC/MCExpr.h"
|
2010-11-15 03:53:02 +08:00
|
|
|
#include "llvm/MC/MCInst.h"
|
2012-11-26 21:34:22 +08:00
|
|
|
#include "llvm/MC/MCInstBuilder.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/MC/MCSectionELF.h"
|
2009-08-11 02:15:01 +08:00
|
|
|
#include "llvm/MC/MCSectionMachO.h"
|
2009-08-19 13:49:37 +08:00
|
|
|
#include "llvm/MC/MCStreamer.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/MC/MCSymbol.h"
|
2015-06-03 04:38:46 +08:00
|
|
|
#include "llvm/MC/MCSymbolELF.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/MC/SectionKind.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include "llvm/Support/CodeGen.h"
|
2010-08-05 06:07:50 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2016-12-13 06:23:53 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <memory>
|
|
|
|
#include <new>
|
|
|
|
|
2004-08-17 07:25:21 +08:00
|
|
|
using namespace llvm;
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2014-04-22 10:41:26 +08:00
|
|
|
#define DEBUG_TYPE "asmprinter"
|
|
|
|
|
2006-12-20 06:59:26 +08:00
|
|
|
namespace {
|
2016-12-13 06:23:53 +08:00
|
|
|
|
2015-09-22 19:13:55 +08:00
|
|
|
class PPCAsmPrinter : public AsmPrinter {
|
|
|
|
protected:
|
|
|
|
MapVector<MCSymbol *, MCSymbol *> TOC;
|
|
|
|
const PPCSubtarget *Subtarget;
|
|
|
|
StackMaps SM;
|
|
|
|
|
|
|
|
public:
|
|
|
|
explicit PPCAsmPrinter(TargetMachine &TM,
|
|
|
|
std::unique_ptr<MCStreamer> Streamer)
|
|
|
|
: AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
|
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
|
2004-08-15 06:09:10 +08:00
|
|
|
|
2016-12-13 06:23:53 +08:00
|
|
|
bool doInitialization(Module &M) override {
|
2016-10-01 10:56:57 +08:00
|
|
|
if (!TOC.empty())
|
|
|
|
TOC.clear();
|
|
|
|
return AsmPrinter::doInitialization(M);
|
2016-12-13 06:23:53 +08:00
|
|
|
}
|
2016-03-12 18:23:07 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitInstruction(const MachineInstr *MI) override;
|
2004-08-15 07:27:29 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-02-02 06:38:46 +08:00
|
|
|
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
|
2010-04-04 13:29:35 +08:00
|
|
|
unsigned AsmVariant, const char *ExtraCode,
|
2014-04-29 15:57:37 +08:00
|
|
|
raw_ostream &O) override;
|
2006-02-25 04:27:40 +08:00
|
|
|
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
|
2010-04-04 13:29:35 +08:00
|
|
|
unsigned AsmVariant, const char *ExtraCode,
|
2014-04-29 15:57:37 +08:00
|
|
|
raw_ostream &O) override;
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
|
|
|
|
void EmitEndOfAsmFile(Module &M) override;
|
|
|
|
|
2015-12-12 09:47:08 +08:00
|
|
|
void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
|
|
|
|
void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
|
2015-02-11 03:09:05 +08:00
|
|
|
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
|
2015-02-17 15:21:21 +08:00
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
|
|
Subtarget = &MF.getSubtarget<PPCSubtarget>();
|
[XRay] Implement powerpc64le xray.
Summary:
powerpc64 big-endian is not supported, but I believe that most logic can
be shared, except for xray_powerpc64.cc.
Also add a function InvalidateInstructionCache to xray_util.h, which is
copied from llvm/Support/Memory.cpp. I'm not sure if I need to add a unittest,
and I don't know how.
Reviewers: dberris, echristo, iteratee, kbarton, hfinkel
Subscribers: mehdi_amini, nemanjai, mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D29742
llvm-svn: 294781
2017-02-11 05:03:24 +08:00
|
|
|
bool Changed = AsmPrinter::runOnMachineFunction(MF);
|
|
|
|
emitXRayTable();
|
|
|
|
return Changed;
|
2015-02-17 15:21:21 +08:00
|
|
|
}
|
2004-09-04 13:00:00 +08:00
|
|
|
};
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2008-08-09 02:22:59 +08:00
|
|
|
/// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCLinuxAsmPrinter : public PPCAsmPrinter {
|
2009-02-24 16:30:20 +08:00
|
|
|
public:
|
2015-01-19 04:29:04 +08:00
|
|
|
explicit PPCLinuxAsmPrinter(TargetMachine &TM,
|
|
|
|
std::unique_ptr<MCStreamer> Streamer)
|
|
|
|
: PPCAsmPrinter(TM, std::move(Streamer)) {}
|
2006-12-22 04:26:09 +08:00
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
StringRef getPassName() const override {
|
2006-12-22 04:26:09 +08:00
|
|
|
return "Linux PPC Assembly Printer";
|
|
|
|
}
|
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
bool doFinalization(Module &M) override;
|
2014-07-19 07:29:49 +08:00
|
|
|
void EmitStartOfAsmFile(Module &M) override;
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitFunctionEntryLabel() override;
|
2012-08-29 03:06:55 +08:00
|
|
|
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
void EmitFunctionBodyStart() override;
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitFunctionBodyEnd() override;
|
[XRay] Implement powerpc64le xray.
Summary:
powerpc64 big-endian is not supported, but I believe that most logic can
be shared, except for xray_powerpc64.cc.
Also add a function InvalidateInstructionCache to xray_util.h, which is
copied from llvm/Support/Memory.cpp. I'm not sure if I need to add a unittest,
and I don't know how.
Reviewers: dberris, echristo, iteratee, kbarton, hfinkel
Subscribers: mehdi_amini, nemanjai, mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D29742
llvm-svn: 294781
2017-02-11 05:03:24 +08:00
|
|
|
void EmitInstruction(const MachineInstr *MI) override;
|
2006-12-22 04:26:09 +08:00
|
|
|
};
|
|
|
|
|
2008-08-09 02:22:59 +08:00
|
|
|
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
|
|
|
|
/// OS X
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCDarwinAsmPrinter : public PPCAsmPrinter {
|
2009-02-24 16:30:20 +08:00
|
|
|
public:
|
2015-01-19 04:29:04 +08:00
|
|
|
explicit PPCDarwinAsmPrinter(TargetMachine &TM,
|
|
|
|
std::unique_ptr<MCStreamer> Streamer)
|
|
|
|
: PPCAsmPrinter(TM, std::move(Streamer)) {}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
StringRef getPassName() const override {
|
2004-09-04 13:00:00 +08:00
|
|
|
return "Darwin PPC Assembly Printer";
|
|
|
|
}
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
bool doFinalization(Module &M) override;
|
|
|
|
void EmitStartOfAsmFile(Module &M) override;
|
2004-06-22 00:55:25 +08:00
|
|
|
};
|
2016-12-13 06:23:53 +08:00
|
|
|
|
|
|
|
} // end anonymous namespace
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
/// stripRegisterPrefix - This method strips the character prefix from a
|
|
|
|
/// register name so that only the number is left. Used by for linux asm.
|
|
|
|
static const char *stripRegisterPrefix(const char *RegName) {
|
|
|
|
switch (RegName[0]) {
|
|
|
|
case 'r':
|
|
|
|
case 'f':
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bytes
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
case 'q': // for QPX
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
case 'v':
|
|
|
|
if (RegName[1] == 's')
|
|
|
|
return RegName + 2;
|
|
|
|
return RegName + 1;
|
2010-11-15 11:39:06 +08:00
|
|
|
case 'c': if (RegName[1] == 'r') return RegName + 2;
|
|
|
|
}
|
2015-09-22 19:14:39 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
return RegName;
|
|
|
|
}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
|
|
|
|
raw_ostream &O) {
|
2015-07-16 14:11:10 +08:00
|
|
|
const DataLayout &DL = getDataLayout();
|
2010-11-15 11:39:06 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(OpNo);
|
2015-09-22 19:14:39 +08:00
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
switch (MO.getType()) {
|
2010-11-15 11:39:06 +08:00
|
|
|
case MachineOperand::MO_Register: {
|
2016-10-04 14:59:23 +08:00
|
|
|
unsigned Reg = MO.getReg();
|
|
|
|
|
|
|
|
// There are VSX instructions that use VSX register numbering (vs0 - vs63)
|
|
|
|
// as well as those that use VMX register numbering (v0 - v31 which
|
|
|
|
// correspond to vs32 - vs63). If we have an instruction that uses VSX
|
|
|
|
// numbering, we need to convert the VMX registers to VSX registers.
|
|
|
|
// Namely, we print 32-63 when the instruction operates on one of the
|
|
|
|
// VMX registers.
|
|
|
|
// (Please synchronize with PPCInstPrinter::printOperand)
|
|
|
|
if (MI->getDesc().TSFlags & PPCII::UseVSXReg) {
|
|
|
|
if (PPCInstrInfo::isVRRegister(Reg))
|
|
|
|
Reg = PPC::VSX32 + (Reg - PPC::V0);
|
|
|
|
else if (PPCInstrInfo::isVFRegister(Reg))
|
|
|
|
Reg = PPC::VSX32 + (Reg - PPC::VF0);
|
|
|
|
}
|
|
|
|
const char *RegName = PPCInstPrinter::getRegisterName(Reg);
|
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Linux assembler (Others?) does not take register mnemonics.
|
|
|
|
// FIXME - What about special registers used in mfspr/mtspr?
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->isDarwin())
|
|
|
|
RegName = stripRegisterPrefix(RegName);
|
2010-11-15 11:39:06 +08:00
|
|
|
O << RegName;
|
|
|
|
return;
|
|
|
|
}
|
2006-05-05 01:21:20 +08:00
|
|
|
case MachineOperand::MO_Immediate:
|
2010-11-15 11:39:06 +08:00
|
|
|
O << MO.getImm();
|
|
|
|
return;
|
2004-07-28 08:00:48 +08:00
|
|
|
|
2006-04-23 02:53:45 +08:00
|
|
|
case MachineOperand::MO_MachineBasicBlock:
|
2015-06-09 08:31:39 +08:00
|
|
|
MO.getMBB()->getSymbol()->print(O, MAI);
|
2006-04-23 02:53:45 +08:00
|
|
|
return;
|
2004-07-09 01:58:04 +08:00
|
|
|
case MachineOperand::MO_ConstantPoolIndex:
|
2015-07-16 14:11:10 +08:00
|
|
|
O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
|
|
|
|
<< MO.getIndex();
|
2004-06-22 00:55:25 +08:00
|
|
|
return;
|
2009-11-05 05:31:18 +08:00
|
|
|
case MachineOperand::MO_BlockAddress:
|
2015-06-09 08:31:39 +08:00
|
|
|
GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
|
2009-11-05 05:31:18 +08:00
|
|
|
return;
|
2004-08-13 17:32:01 +08:00
|
|
|
case MachineOperand::MO_GlobalAddress: {
|
2005-12-16 08:22:14 +08:00
|
|
|
// Computing the address of a global symbol, not calling it.
|
2010-04-15 09:51:59 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
2010-01-16 10:00:23 +08:00
|
|
|
MCSymbol *SymToPrint;
|
2004-08-13 17:32:01 +08:00
|
|
|
|
2004-10-18 07:01:34 +08:00
|
|
|
// External or weakly linked global variables need non-lazily-resolved stubs
|
2016-06-24 21:28:26 +08:00
|
|
|
if (Subtarget->hasLazyResolverStub(GV)) {
|
2016-06-24 21:08:06 +08:00
|
|
|
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
|
|
|
|
MachineModuleInfoImpl::StubValueTy &StubSym =
|
|
|
|
MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
|
|
|
|
SymToPrint);
|
|
|
|
if (!StubSym.getPointer())
|
|
|
|
StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
|
|
|
|
!GV->hasInternalLinkage());
|
2009-07-15 09:14:44 +08:00
|
|
|
} else {
|
2013-10-30 01:07:16 +08:00
|
|
|
SymToPrint = getSymbol(GV);
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
2015-06-09 08:31:39 +08:00
|
|
|
|
|
|
|
SymToPrint->print(O, MAI);
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2010-04-04 06:28:33 +08:00
|
|
|
printOffset(MO.getOffset(), O);
|
2004-06-22 00:55:25 +08:00
|
|
|
return;
|
2004-08-13 17:32:01 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
default:
|
2014-05-17 07:28:17 +08:00
|
|
|
O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
|
2004-06-25 23:11:34 +08:00
|
|
|
return;
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-02-24 03:31:10 +08:00
|
|
|
/// PrintAsmOperand - Print out an operand for an inline asm expression.
|
|
|
|
///
|
|
|
|
bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
|
2008-08-09 02:22:59 +08:00
|
|
|
unsigned AsmVariant,
|
2010-04-04 13:29:35 +08:00
|
|
|
const char *ExtraCode, raw_ostream &O) {
|
2006-02-24 03:31:10 +08:00
|
|
|
// Does this asm operand have a single letter operand modifier?
|
|
|
|
if (ExtraCode && ExtraCode[0]) {
|
|
|
|
if (ExtraCode[1] != 0) return true; // Unknown modifier.
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-02-24 03:31:10 +08:00
|
|
|
switch (ExtraCode[0]) {
|
2012-06-26 21:49:27 +08:00
|
|
|
default:
|
|
|
|
// See if this is a generic print operand
|
|
|
|
return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
|
2007-01-25 10:52:50 +08:00
|
|
|
case 'c': // Don't print "$" before a global var name or constant.
|
2010-11-15 11:39:06 +08:00
|
|
|
break; // PPC never has a prefix.
|
2008-08-09 02:22:59 +08:00
|
|
|
case 'L': // Write second word of DImode reference.
|
2006-02-24 03:31:10 +08:00
|
|
|
// Verify that this operand has two consecutive registers.
|
2008-10-03 23:45:36 +08:00
|
|
|
if (!MI->getOperand(OpNo).isReg() ||
|
2006-02-24 03:31:10 +08:00
|
|
|
OpNo+1 == MI->getNumOperands() ||
|
2008-10-03 23:45:36 +08:00
|
|
|
!MI->getOperand(OpNo+1).isReg())
|
2006-02-24 03:31:10 +08:00
|
|
|
return true;
|
|
|
|
++OpNo; // Return the high-part.
|
|
|
|
break;
|
2007-04-25 06:51:03 +08:00
|
|
|
case 'I':
|
|
|
|
// Write 'i' if an integer constant, otherwise nothing. Used to print
|
|
|
|
// addi vs add, etc.
|
2008-10-03 23:45:36 +08:00
|
|
|
if (MI->getOperand(OpNo).isImm())
|
2007-04-25 06:51:03 +08:00
|
|
|
O << "i";
|
|
|
|
return false;
|
2006-02-24 03:31:10 +08:00
|
|
|
}
|
|
|
|
}
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2010-04-04 12:47:45 +08:00
|
|
|
printOperand(MI, OpNo, O);
|
2006-02-24 03:31:10 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-08-18 08:18:39 +08:00
|
|
|
// At the moment, all inline asm memory operands are a single register.
|
|
|
|
// In any case, the output of this routine should always be just one
|
|
|
|
// assembler operand.
|
|
|
|
|
2006-02-25 04:27:40 +08:00
|
|
|
bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
|
2008-08-09 02:22:59 +08:00
|
|
|
unsigned AsmVariant,
|
2010-04-04 13:29:35 +08:00
|
|
|
const char *ExtraCode,
|
|
|
|
raw_ostream &O) {
|
2012-11-06 02:18:42 +08:00
|
|
|
if (ExtraCode && ExtraCode[0]) {
|
|
|
|
if (ExtraCode[1] != 0) return true; // Unknown modifier.
|
|
|
|
|
|
|
|
switch (ExtraCode[0]) {
|
|
|
|
default: return true; // Unknown modifier.
|
|
|
|
case 'y': // A memory reference for an X-form instruction
|
|
|
|
{
|
|
|
|
const char *RegName = "r0";
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->isDarwin())
|
|
|
|
RegName = stripRegisterPrefix(RegName);
|
2012-11-06 02:18:42 +08:00
|
|
|
O << RegName << ", ";
|
|
|
|
printOperand(MI, OpNo, O);
|
|
|
|
return false;
|
|
|
|
}
|
2014-09-12 04:10:03 +08:00
|
|
|
case 'U': // Print 'u' for update form.
|
|
|
|
case 'X': // Print 'x' for indexed form.
|
2015-09-22 19:15:07 +08:00
|
|
|
{
|
|
|
|
// FIXME: Currently for PowerPC memory operands are always loaded
|
|
|
|
// into a register, so we never get an update or indexed form.
|
|
|
|
// This is bad even for offset forms, since even if we know we
|
|
|
|
// have a value in -16(r1), we will generate a load into r<n>
|
|
|
|
// and then load from 0(r<n>). Until that issue is fixed,
|
|
|
|
// tolerate 'U' and 'X' but don't output anything.
|
|
|
|
assert(MI->getOperand(OpNo).isReg());
|
|
|
|
return false;
|
|
|
|
}
|
2012-11-06 02:18:42 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
assert(MI->getOperand(OpNo).isReg());
|
2009-08-27 02:10:32 +08:00
|
|
|
O << "0(";
|
2010-04-04 12:47:45 +08:00
|
|
|
printOperand(MI, OpNo, O);
|
2009-08-27 02:10:32 +08:00
|
|
|
O << ")";
|
2006-02-25 04:27:40 +08:00
|
|
|
return false;
|
|
|
|
}
|
2006-02-24 03:31:10 +08:00
|
|
|
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
|
|
|
|
/// exists for it. If not, create one. Then return a symbol that references
|
|
|
|
/// the TOC entry.
|
|
|
|
MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
|
|
|
|
MCSymbol *&TOCEntry = TOC[Sym];
|
2015-03-17 22:50:32 +08:00
|
|
|
if (!TOCEntry)
|
2015-03-18 04:07:06 +08:00
|
|
|
TOCEntry = createTempSymbol("C");
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return TOCEntry;
|
|
|
|
}
|
|
|
|
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
|
|
|
|
SM.serializeToStackMapSection();
|
|
|
|
}
|
|
|
|
|
2015-12-12 09:47:08 +08:00
|
|
|
void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
unsigned NumNOPBytes = MI.getOperand(1).getImm();
|
|
|
|
|
|
|
|
SM.recordStackMap(MI);
|
|
|
|
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
|
|
|
|
|
|
|
|
// Scan ahead to trim the shadow.
|
|
|
|
const MachineBasicBlock &MBB = *MI.getParent();
|
|
|
|
MachineBasicBlock::const_iterator MII(MI);
|
|
|
|
++MII;
|
|
|
|
while (NumNOPBytes > 0) {
|
|
|
|
if (MII == MBB.end() || MII->isCall() ||
|
|
|
|
MII->getOpcode() == PPC::DBG_VALUE ||
|
|
|
|
MII->getOpcode() == TargetOpcode::PATCHPOINT ||
|
|
|
|
MII->getOpcode() == TargetOpcode::STACKMAP)
|
|
|
|
break;
|
|
|
|
++MII;
|
|
|
|
NumNOPBytes -= 4;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit nops.
|
|
|
|
for (unsigned i = 0; i < NumNOPBytes; i += 4)
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Lower a patchpoint of the form:
|
|
|
|
// [<def>], <id>, <numBytes>, <target>, <numArgs>
|
2015-12-12 09:47:08 +08:00
|
|
|
void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
SM.recordPatchPoint(MI);
|
|
|
|
PatchPointOpers Opers(&MI);
|
|
|
|
|
|
|
|
unsigned EncodedBytes = 0;
|
2016-08-24 07:33:29 +08:00
|
|
|
const MachineOperand &CalleeMO = Opers.getCallTarget();
|
2015-07-15 06:53:11 +08:00
|
|
|
|
|
|
|
if (CalleeMO.isImm()) {
|
2016-08-24 07:33:29 +08:00
|
|
|
int64_t CallTarget = CalleeMO.getImm();
|
2015-07-15 06:53:11 +08:00
|
|
|
if (CallTarget) {
|
|
|
|
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
|
|
|
|
"High 16 bits of call target should be zero.");
|
|
|
|
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
|
|
|
|
EncodedBytes = 0;
|
|
|
|
// Materialize the jump address:
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addImm((CallTarget >> 32) & 0xFFFF));
|
|
|
|
++EncodedBytes;
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addImm(32).addImm(16));
|
|
|
|
++EncodedBytes;
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addImm((CallTarget >> 16) & 0xFFFF));
|
|
|
|
++EncodedBytes;
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addImm(CallTarget & 0xFFFF));
|
|
|
|
|
|
|
|
// Save the current TOC pointer before the remote call.
|
2017-04-11 06:22:11 +08:00
|
|
|
int TOCSaveOffset = Subtarget->getFrameLowering()->getTOCSaveOffset();
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
|
2015-07-15 06:26:06 +08:00
|
|
|
.addReg(PPC::X2)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addImm(TOCSaveOffset)
|
|
|
|
.addReg(PPC::X1));
|
|
|
|
++EncodedBytes;
|
|
|
|
|
|
|
|
// If we're on ELFv1, then we need to load the actual function pointer
|
|
|
|
// from the function descriptor.
|
|
|
|
if (!Subtarget->isELFv2ABI()) {
|
2015-09-22 19:14:12 +08:00
|
|
|
// Load the new TOC pointer and the function address, but not r11
|
|
|
|
// (needing this is rare, and loading it here would prevent passing it
|
|
|
|
// via a 'nest' parameter.
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addImm(8)
|
|
|
|
.addReg(ScratchReg));
|
|
|
|
++EncodedBytes;
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(ScratchReg)
|
|
|
|
.addImm(0)
|
|
|
|
.addReg(ScratchReg));
|
|
|
|
++EncodedBytes;
|
|
|
|
}
|
|
|
|
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8)
|
2015-07-15 06:26:06 +08:00
|
|
|
.addReg(ScratchReg));
|
|
|
|
++EncodedBytes;
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8));
|
2015-07-15 06:53:11 +08:00
|
|
|
++EncodedBytes;
|
|
|
|
|
|
|
|
// Restore the TOC pointer after the call.
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addImm(TOCSaveOffset)
|
|
|
|
.addReg(PPC::X1));
|
2015-07-15 06:26:06 +08:00
|
|
|
++EncodedBytes;
|
|
|
|
}
|
2015-07-15 06:53:11 +08:00
|
|
|
} else if (CalleeMO.isGlobal()) {
|
|
|
|
const GlobalValue *GValue = CalleeMO.getGlobal();
|
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
|
|
|
const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
|
|
|
|
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP)
|
2015-07-15 06:53:11 +08:00
|
|
|
.addExpr(SymVar));
|
|
|
|
EncodedBytes += 2;
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
}
|
|
|
|
|
2015-07-15 06:26:06 +08:00
|
|
|
// Each instruction is 4 bytes.
|
|
|
|
EncodedBytes *= 4;
|
|
|
|
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
// Emit padding.
|
2016-08-24 07:33:29 +08:00
|
|
|
unsigned NumBytes = Opers.getNumPatchBytes();
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
assert(NumBytes >= EncodedBytes &&
|
|
|
|
"Patchpoint can't request size less than the length of a call.");
|
|
|
|
assert((NumBytes - EncodedBytes) % 4 == 0 &&
|
|
|
|
"Invalid number of NOP bytes requested!");
|
|
|
|
for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
|
2015-12-12 09:47:08 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
}
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2015-02-11 03:09:05 +08:00
|
|
|
/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
|
|
|
|
/// call to __tls_get_addr to the current output stream.
|
|
|
|
void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
|
|
|
|
MCSymbolRefExpr::VariantKind VK) {
|
|
|
|
StringRef Name = "__tls_get_addr";
|
2015-05-19 02:43:14 +08:00
|
|
|
MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
|
2015-02-11 03:09:05 +08:00
|
|
|
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
|
|
|
|
|
|
|
|
assert(MI->getOperand(0).isReg() &&
|
2015-02-11 03:31:55 +08:00
|
|
|
((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) ||
|
|
|
|
(!Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::R3)) &&
|
2015-02-11 03:09:05 +08:00
|
|
|
"GETtls[ld]ADDR[32] must define GPR3");
|
|
|
|
assert(MI->getOperand(1).isReg() &&
|
2015-02-11 03:31:55 +08:00
|
|
|
((Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::X3) ||
|
|
|
|
(!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) &&
|
2015-02-11 03:09:05 +08:00
|
|
|
"GETtls[ld]ADDR[32] must read GPR3");
|
|
|
|
|
2015-02-11 03:31:55 +08:00
|
|
|
if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
|
2016-06-27 22:19:45 +08:00
|
|
|
isPositionIndependent())
|
2015-02-11 03:09:05 +08:00
|
|
|
Kind = MCSymbolRefExpr::VK_PLT;
|
|
|
|
const MCSymbolRefExpr *TlsRef =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
|
2015-02-11 03:09:05 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2015-05-30 09:25:56 +08:00
|
|
|
const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer,
|
2015-02-11 03:31:55 +08:00
|
|
|
MCInstBuilder(Subtarget->isPPC64() ?
|
2015-02-11 03:09:05 +08:00
|
|
|
PPC::BL8_NOP_TLS : PPC::BL_TLS)
|
|
|
|
.addExpr(TlsRef)
|
|
|
|
.addExpr(SymVar));
|
|
|
|
}
|
|
|
|
|
2010-01-28 09:28:58 +08:00
|
|
|
/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
|
2004-08-15 06:09:10 +08:00
|
|
|
/// the current output stream.
|
2004-06-22 00:55:25 +08:00
|
|
|
///
|
2010-01-28 09:28:58 +08:00
|
|
|
void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
|
2010-11-15 11:39:06 +08:00
|
|
|
MCInst TmpInst;
|
2015-02-10 08:44:17 +08:00
|
|
|
bool isPPC64 = Subtarget->isPPC64();
|
2015-06-16 23:44:21 +08:00
|
|
|
bool isDarwin = TM.getTargetTriple().isOSDarwin();
|
2014-11-12 23:16:30 +08:00
|
|
|
const Module *M = MF->getFunction()->getParent();
|
|
|
|
PICLevel::Level PL = M->getPICLevel();
|
2015-09-22 19:14:39 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Lower multi-instruction pseudo operations.
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: break;
|
2013-06-17 04:34:27 +08:00
|
|
|
case TargetOpcode::DBG_VALUE:
|
|
|
|
llvm_unreachable("Should be handled target independently");
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
case TargetOpcode::STACKMAP:
|
2015-12-12 09:47:08 +08:00
|
|
|
return LowerSTACKMAP(SM, *MI);
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
case TargetOpcode::PATCHPOINT:
|
2015-12-12 09:47:08 +08:00
|
|
|
return LowerPATCHPOINT(SM, *MI);
|
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support""
This re-applies r225808, fixed to avoid problems with SDAG dependencies along
with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs.
These problems caused the original regression tests to assert/segfault on many
(but not all) systems.
Original commit message:
This commit does two things:
1. Refactors PPCFastISel to use more of the common infrastructure for call
lowering (this lets us take advantage of this common code for lowering some
common intrinsics, stackmap/patchpoint among them).
2. Adds support for stackmap/patchpoint lowering. For the most part, this is
very similar to the support in the AArch64 target, with the obvious differences
(different registers, NOP instructions, etc.). The test cases are adapted
from the AArch64 test cases.
One difference of note is that the patchpoint call sequence takes 24 bytes, so
you can't use less than that (on AArch64 you can go down to 16). Also, as noted
in the docs, we take the patchpoint address to be the actual code address
(assuming the call is local in the TOC-sharing sense), which should yield
higher performance than generating the full cross-DSO indirect-call sequence
and is likely just as useful for JITed code (if not, we'll change it).
StackMaps and Patchpoints are still marked as experimental, and so this support
is doubly experimental. So go ahead and experiment!
llvm-svn: 225909
2015-01-14 09:07:51 +08:00
|
|
|
|
2014-11-12 23:16:30 +08:00
|
|
|
case PPC::MoveGOTtoLR: {
|
|
|
|
// Transform %LR = MoveGOTtoLR
|
|
|
|
// Into this: bl _GLOBAL_OFFSET_TABLE_@local-4
|
|
|
|
// _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding
|
|
|
|
// _GLOBAL_OFFSET_TABLE_) has exactly one instruction:
|
|
|
|
// blrl
|
|
|
|
// This will return the pointer to _GLOBAL_OFFSET_TABLE_@local
|
|
|
|
MCSymbol *GOTSymbol =
|
2015-05-19 02:43:14 +08:00
|
|
|
OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
|
2014-11-12 23:16:30 +08:00
|
|
|
const MCExpr *OffsExpr =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol,
|
2014-11-12 23:16:30 +08:00
|
|
|
MCSymbolRefExpr::VK_PPC_LOCAL,
|
|
|
|
OutContext),
|
2015-05-30 09:25:56 +08:00
|
|
|
MCConstantExpr::create(4, OutContext),
|
2014-11-12 23:16:30 +08:00
|
|
|
OutContext);
|
|
|
|
|
|
|
|
// Emit the 'bl'.
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL).addExpr(OffsExpr));
|
2014-11-12 23:16:30 +08:00
|
|
|
return;
|
|
|
|
}
|
2010-11-15 11:39:06 +08:00
|
|
|
case PPC::MovePCtoLR:
|
|
|
|
case PPC::MovePCtoLR8: {
|
|
|
|
// Transform %LR = MovePCtoLR
|
2015-09-22 19:14:12 +08:00
|
|
|
// Into this, where the label is the PIC base:
|
2010-11-15 11:39:06 +08:00
|
|
|
// bl L1$pb
|
|
|
|
// L1$pb:
|
|
|
|
MCSymbol *PICBase = MF->getPICBaseSymbol();
|
2015-09-22 19:14:39 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Emit the 'bl'.
|
2015-09-22 19:13:55 +08:00
|
|
|
EmitToStreamer(*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::BL)
|
|
|
|
// FIXME: We would like an efficient form for this, so we
|
|
|
|
// don't have to do a lot of extra uniquing.
|
|
|
|
.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
|
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Emit the label.
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(PICBase);
|
2010-11-15 11:39:06 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-11-12 23:16:30 +08:00
|
|
|
case PPC::UpdateGBR: {
|
|
|
|
// Transform %Rd = UpdateGBR(%Rt, %Ri)
|
|
|
|
// Into: lwz %Rt, .L0$poff - .L0$pb(%Ri)
|
|
|
|
// add %Rd, %Rt, %Ri
|
2014-07-19 07:29:49 +08:00
|
|
|
// Get the offset from the GOT Base Register to the GOT
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
|
|
|
MCSymbol *PICOffset =
|
|
|
|
MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
|
2014-07-19 07:29:49 +08:00
|
|
|
TmpInst.setOpcode(PPC::LWZ);
|
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
|
2014-07-19 07:29:49 +08:00
|
|
|
const MCExpr *PB =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
|
2014-07-19 07:29:49 +08:00
|
|
|
MCSymbolRefExpr::VK_None,
|
|
|
|
OutContext);
|
2014-11-12 23:16:30 +08:00
|
|
|
const MCOperand TR = TmpInst.getOperand(1);
|
|
|
|
const MCOperand PICR = TmpInst.getOperand(0);
|
|
|
|
|
|
|
|
// Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
|
|
|
|
TmpInst.getOperand(1) =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
|
2014-11-12 23:16:30 +08:00
|
|
|
TmpInst.getOperand(0) = TR;
|
|
|
|
TmpInst.getOperand(2) = PICR;
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2014-11-12 23:16:30 +08:00
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
TmpInst.setOpcode(PPC::ADD4);
|
2014-11-12 23:16:30 +08:00
|
|
|
TmpInst.getOperand(0) = PICR;
|
|
|
|
TmpInst.getOperand(1) = TR;
|
|
|
|
TmpInst.getOperand(2) = PICR;
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2014-07-19 07:29:49 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::LWZtoc: {
|
2014-11-12 23:16:30 +08:00
|
|
|
// Transform %R3 = LWZtoc <ga:@min1>, %R2
|
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
2014-07-19 07:29:49 +08:00
|
|
|
|
|
|
|
// Change the opcode to LWZ, and the global address operand to be a
|
|
|
|
// reference to the GOT entry we will synthesize later.
|
|
|
|
TmpInst.setOpcode(PPC::LWZ);
|
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
|
|
|
|
|
|
|
// Map symbol -> label of TOC entry
|
2014-10-31 18:33:14 +08:00
|
|
|
assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
|
2014-07-19 07:29:49 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
|
|
|
if (MO.isGlobal())
|
|
|
|
MOSymbol = getSymbol(MO.getGlobal());
|
|
|
|
else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
else if (MO.isJTI())
|
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
2014-10-31 18:33:14 +08:00
|
|
|
else if (MO.isBlockAddress())
|
|
|
|
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
|
2014-07-19 07:29:49 +08:00
|
|
|
|
2016-06-18 02:07:14 +08:00
|
|
|
if (PL == PICLevel::SmallPIC) {
|
2014-11-12 23:16:30 +08:00
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
|
2014-11-12 23:16:30 +08:00
|
|
|
OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
|
2014-11-12 23:16:30 +08:00
|
|
|
} else {
|
|
|
|
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None,
|
2014-11-12 23:16:30 +08:00
|
|
|
OutContext);
|
|
|
|
const MCExpr *PB =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
|
2014-11-12 23:16:30 +08:00
|
|
|
OutContext);
|
2015-05-30 09:25:56 +08:00
|
|
|
Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
|
2014-11-12 23:16:30 +08:00
|
|
|
}
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2014-07-19 07:29:49 +08:00
|
|
|
return;
|
|
|
|
}
|
2012-08-25 00:26:02 +08:00
|
|
|
case PPC::LDtocJTI:
|
|
|
|
case PPC::LDtocCPT:
|
2014-10-31 18:33:14 +08:00
|
|
|
case PPC::LDtocBA:
|
2010-11-15 11:39:06 +08:00
|
|
|
case PPC::LDtoc: {
|
|
|
|
// Transform %X3 = LDtoc <ga:@min1>, %X2
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
2012-08-25 00:26:02 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Change the opcode to LD, and the global address operand to be a
|
|
|
|
// reference to the TOC entry we will synthesize later.
|
|
|
|
TmpInst.setOpcode(PPC::LD);
|
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
2012-08-25 00:26:02 +08:00
|
|
|
|
|
|
|
// Map symbol -> label of TOC entry
|
2014-10-31 18:33:14 +08:00
|
|
|
assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
2012-08-25 00:26:02 +08:00
|
|
|
if (MO.isGlobal())
|
2013-10-30 01:07:16 +08:00
|
|
|
MOSymbol = getSymbol(MO.getGlobal());
|
2012-08-25 00:26:02 +08:00
|
|
|
else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
else if (MO.isJTI())
|
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
2014-10-31 18:33:14 +08:00
|
|
|
else if (MO.isBlockAddress())
|
|
|
|
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
|
2012-09-19 01:10:37 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
|
2010-11-15 11:39:06 +08:00
|
|
|
OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2010-11-15 03:53:02 +08:00
|
|
|
return;
|
|
|
|
}
|
2015-09-22 19:14:39 +08:00
|
|
|
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
case PPC::ADDIStocHA: {
|
|
|
|
// Transform %Xd = ADDIStocHA %X2, <ga:@sym>
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2014-06-17 05:36:02 +08:00
|
|
|
// Change the opcode to ADDIS8. If the global address is external, has
|
|
|
|
// common linkage, is a non-local function address, or is a jump table
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// address, then generate a TOC entry and reference that. Otherwise
|
|
|
|
// reference the symbol directly.
|
|
|
|
TmpInst.setOpcode(PPC::ADDIS8);
|
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
2014-10-31 18:33:14 +08:00
|
|
|
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
|
|
|
|
MO.isBlockAddress()) &&
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
"Invalid operand for ADDIStocHA!");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
2015-11-21 04:51:31 +08:00
|
|
|
bool GlobalToc = false;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isGlobal()) {
|
2014-05-29 23:41:38 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
|
|
|
MOSymbol = getSymbol(GV);
|
2015-11-21 04:51:31 +08:00
|
|
|
unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
|
|
|
|
GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG);
|
|
|
|
} else if (MO.isCPI()) {
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
2015-11-21 04:51:31 +08:00
|
|
|
} else if (MO.isJTI()) {
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
2015-11-21 04:51:31 +08:00
|
|
|
} else if (MO.isBlockAddress()) {
|
2014-10-31 18:33:14 +08:00
|
|
|
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
|
2015-11-21 04:51:31 +08:00
|
|
|
}
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2015-11-21 04:51:31 +08:00
|
|
|
if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
|
2014-10-31 18:33:14 +08:00
|
|
|
TM.getCodeModel() == CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
2016-09-03 05:37:07 +08:00
|
|
|
|
|
|
|
if (!MO.isJTI() && MO.getOffset())
|
|
|
|
Exp = MCBinaryExpr::createAdd(Exp,
|
|
|
|
MCConstantExpr::create(MO.getOffset(),
|
|
|
|
OutContext),
|
|
|
|
OutContext);
|
|
|
|
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::LDtocL: {
|
|
|
|
// Transform %Xd = LDtocL <ga:@sym>, %Xs
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2013-03-26 18:55:45 +08:00
|
|
|
// Change the opcode to LD. If the global address is external, has
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// common linkage, or is a jump table address, then reference the
|
|
|
|
// associated TOC entry. Otherwise reference the symbol directly.
|
2013-03-26 18:55:45 +08:00
|
|
|
TmpInst.setOpcode(PPC::LD);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
2014-10-31 18:33:14 +08:00
|
|
|
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
|
|
|
|
MO.isBlockAddress()) &&
|
2013-02-22 01:12:27 +08:00
|
|
|
"Invalid operand for LDtocL!");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isJTI())
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
|
2014-10-31 18:33:14 +08:00
|
|
|
else if (MO.isBlockAddress()) {
|
|
|
|
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
}
|
2013-09-18 04:03:25 +08:00
|
|
|
else if (MO.isCPI()) {
|
2013-02-22 01:12:27 +08:00
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
2013-09-18 04:03:25 +08:00
|
|
|
if (TM.getCodeModel() == CodeModel::Large)
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
}
|
2013-02-22 01:12:27 +08:00
|
|
|
else if (MO.isGlobal()) {
|
2015-11-21 04:51:31 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
|
|
|
MOSymbol = getSymbol(GV);
|
|
|
|
DEBUG(
|
|
|
|
unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
|
|
|
|
assert((GVFlags & PPCII::MO_NLP_FLAG) &&
|
|
|
|
"LDtocL used on symbol that could be accessed directly is "
|
|
|
|
"invalid. Must match ADDIStocHA."));
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::ADDItocL: {
|
|
|
|
// Transform %Xd = ADDItocL %Xs, <ga:@sym>
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2013-03-26 18:55:20 +08:00
|
|
|
// Change the opcode to ADDI8. If the global address is external, then
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// generate a TOC entry and reference that. Otherwise reference the
|
|
|
|
// symbol directly.
|
2013-03-26 18:55:20 +08:00
|
|
|
TmpInst.setOpcode(PPC::ADDI8);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isGlobal()) {
|
2014-05-29 23:41:38 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
2015-11-21 04:51:31 +08:00
|
|
|
DEBUG(
|
|
|
|
unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
|
|
|
|
assert (
|
|
|
|
!(GVFlags & PPCII::MO_NLP_FLAG) &&
|
|
|
|
"Interposable definitions must use indirect access."));
|
2014-05-29 23:41:38 +08:00
|
|
|
MOSymbol = getSymbol(GV);
|
2015-11-21 04:51:31 +08:00
|
|
|
} else if (MO.isCPI()) {
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
2015-11-21 04:51:31 +08:00
|
|
|
}
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
case PPC::ADDISgotTprelHA: {
|
|
|
|
// Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
|
2015-02-10 08:44:17 +08:00
|
|
|
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
const MCExpr *SymGotTprel =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTprel));
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
case PPC::LDgotTprelL:
|
|
|
|
case PPC::LDgotTprelL32: {
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
// Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
2012-12-05 00:18:08 +08:00
|
|
|
|
2013-03-26 18:55:45 +08:00
|
|
|
// Change the opcode to LD.
|
2013-12-21 02:08:54 +08:00
|
|
|
TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
|
2012-12-05 00:18:08 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-05 00:18:08 +08:00
|
|
|
const MCExpr *Exp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
|
2012-12-05 00:18:08 +08:00
|
|
|
OutContext);
|
2015-05-14 02:37:00 +08:00
|
|
|
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2012-12-05 00:18:08 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::PPC32PICGOT: {
|
2015-05-19 02:43:14 +08:00
|
|
|
MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
|
|
|
|
MCSymbol *GOTRef = OutContext.createTempSymbol();
|
|
|
|
MCSymbol *NextInstr = OutContext.createTempSymbol();
|
2014-07-26 01:47:22 +08:00
|
|
|
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
|
2014-07-26 01:47:22 +08:00
|
|
|
// FIXME: We would like an efficient form for this, so we don't have to do
|
|
|
|
// a lot of extra uniquing.
|
2015-05-30 09:25:56 +08:00
|
|
|
.addExpr(MCSymbolRefExpr::create(NextInstr, OutContext)));
|
2014-07-26 01:47:22 +08:00
|
|
|
const MCExpr *OffsExpr =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext),
|
|
|
|
MCSymbolRefExpr::create(GOTRef, OutContext),
|
2014-07-26 01:47:22 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(GOTRef);
|
|
|
|
OutStreamer->EmitValue(OffsExpr, 4);
|
|
|
|
OutStreamer->EmitLabel(NextInstr);
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR)
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LWZ)
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addImm(0)
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD4)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
2014-07-26 01:47:22 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
case PPC::PPC32GOT: {
|
2015-09-22 19:13:55 +08:00
|
|
|
MCSymbol *GOTSymbol =
|
|
|
|
OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
|
|
|
|
const MCExpr *SymGotTlsL = MCSymbolRefExpr::create(
|
|
|
|
GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext);
|
|
|
|
const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create(
|
|
|
|
GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addExpr(SymGotTlsL));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addExpr(SymGotTlsHA));
|
2013-12-21 02:08:54 +08:00
|
|
|
return;
|
|
|
|
}
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
case PPC::ADDIStlsgdHA: {
|
|
|
|
// Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
|
2015-02-10 08:44:17 +08:00
|
|
|
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MCExpr *SymGotTlsGD =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsGD));
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsgdL:
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
// Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsgdL32: {
|
|
|
|
// Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@got@tlsgd
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2015-05-30 09:25:56 +08:00
|
|
|
const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create(
|
2015-02-10 08:44:17 +08:00
|
|
|
MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
|
|
|
|
: MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
|
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer,
|
2015-02-10 08:44:17 +08:00
|
|
|
MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
2014-07-26 01:47:22 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsGD));
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
return;
|
|
|
|
}
|
2015-02-11 03:09:05 +08:00
|
|
|
case PPC::GETtlsADDR:
|
|
|
|
// Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
|
|
|
|
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
|
|
|
|
case PPC::GETtlsADDR32: {
|
|
|
|
// Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
|
|
|
|
// Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
|
|
|
|
EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD);
|
|
|
|
return;
|
|
|
|
}
|
2012-12-13 03:29:35 +08:00
|
|
|
case PPC::ADDIStlsldHA: {
|
|
|
|
// Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha
|
2015-02-10 08:44:17 +08:00
|
|
|
assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymGotTlsLD =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsLD));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsldL:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsldL32: {
|
|
|
|
// Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@got@tlsld
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2015-05-30 09:25:56 +08:00
|
|
|
const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create(
|
2015-02-10 08:44:17 +08:00
|
|
|
MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
|
|
|
|
: MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
|
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer,
|
2015-02-10 08:44:17 +08:00
|
|
|
MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsLD));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2015-02-11 03:09:05 +08:00
|
|
|
case PPC::GETtlsldADDR:
|
|
|
|
// Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
|
|
|
|
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
|
|
|
|
case PPC::GETtlsldADDR32: {
|
|
|
|
// Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
|
|
|
|
// Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
|
|
|
|
EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD);
|
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDISdtprelHA:
|
2015-08-30 15:44:05 +08:00
|
|
|
// Transform: %Xd = ADDISdtprelHA %Xs, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDISdtprelHA32: {
|
2015-08-30 15:44:05 +08:00
|
|
|
// Transform: %Rd = ADDISdtprelHA32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDIS %Rs, sym@dtprel@ha
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymDtprel =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2015-02-10 08:44:17 +08:00
|
|
|
EmitToStreamer(
|
2015-04-25 03:11:51 +08:00
|
|
|
*OutStreamer,
|
2015-02-10 08:44:17 +08:00
|
|
|
MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
2015-08-30 15:44:05 +08:00
|
|
|
.addReg(MI->getOperand(1).getReg())
|
2015-02-10 08:44:17 +08:00
|
|
|
.addExpr(SymDtprel));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDIdtprelL:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@dtprel@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDIdtprelL32: {
|
|
|
|
// Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@dtprel@l
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymDtprel =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer,
|
2015-02-10 08:44:17 +08:00
|
|
|
MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymDtprel));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
case PPC::MFOCRF:
|
|
|
|
case PPC::MFOCRF8:
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->hasMFOCRF()) {
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
// Transform: %R3 = MFOCRF %CR7
|
|
|
|
// Into: %R3 = MFCR ;; cr7
|
|
|
|
unsigned NewOpcode =
|
|
|
|
MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->AddComment(PPCInstPrinter::
|
|
|
|
getRegisterName(MI->getOperand(1).getReg()));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
2013-07-04 01:59:07 +08:00
|
|
|
case PPC::MTOCRF:
|
|
|
|
case PPC::MTOCRF8:
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->hasMFOCRF()) {
|
2013-07-04 01:59:07 +08:00
|
|
|
// Transform: %CR7 = MTOCRF %R3
|
|
|
|
// Into: MTCRF mask, %R3 ;; cr7
|
|
|
|
unsigned NewOpcode =
|
|
|
|
MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
|
|
|
|
unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
|
|
|
|
->getEncodingValue(MI->getOperand(0).getReg());
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->AddComment(PPCInstPrinter::
|
|
|
|
getRegisterName(MI->getOperand(0).getReg()));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
|
|
|
|
.addImm(Mask)
|
|
|
|
.addReg(MI->getOperand(1).getReg()));
|
2013-07-04 01:59:07 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
case PPC::LD:
|
|
|
|
case PPC::STD:
|
2013-08-30 23:18:11 +08:00
|
|
|
case PPC::LWA_32:
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
case PPC::LWA: {
|
|
|
|
// Verify alignment is legal, so we don't create relocations
|
|
|
|
// that can't be supported.
|
|
|
|
// FIXME: This test is currently disabled for Darwin. The test
|
|
|
|
// suite shows a handful of test cases that fail this check for
|
|
|
|
// Darwin. Those need to be investigated before this sanity test
|
|
|
|
// can be enabled for those subtargets.
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->isDarwin()) {
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
|
|
|
|
const MachineOperand &MO = MI->getOperand(OpNum);
|
|
|
|
if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
|
|
|
|
llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
|
|
|
|
}
|
|
|
|
// Now process the instruction normally.
|
|
|
|
break;
|
|
|
|
}
|
2010-01-28 09:28:58 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2014-11-12 23:16:30 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
|
2015-04-25 03:11:51 +08:00
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
2004-09-04 13:00:00 +08:00
|
|
|
}
|
|
|
|
|
[XRay] Implement powerpc64le xray.
Summary:
powerpc64 big-endian is not supported, but I believe that most logic can
be shared, except for xray_powerpc64.cc.
Also add a function InvalidateInstructionCache to xray_util.h, which is
copied from llvm/Support/Memory.cpp. I'm not sure if I need to add a unittest,
and I don't know how.
Reviewers: dberris, echristo, iteratee, kbarton, hfinkel
Subscribers: mehdi_amini, nemanjai, mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D29742
llvm-svn: 294781
2017-02-11 05:03:24 +08:00
|
|
|
void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
|
|
|
|
if (!Subtarget->isPPC64())
|
|
|
|
return PPCAsmPrinter::EmitInstruction(MI);
|
|
|
|
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default:
|
|
|
|
return PPCAsmPrinter::EmitInstruction(MI);
|
|
|
|
case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
|
|
|
|
// .begin:
|
|
|
|
// b .end # lis 0, FuncId[16..32]
|
|
|
|
// nop # li 0, FuncId[0..15]
|
|
|
|
// std 0, -8(1)
|
|
|
|
// mflr 0
|
|
|
|
// bl __xray_FunctionEntry
|
|
|
|
// mtlr 0
|
|
|
|
// .end:
|
|
|
|
//
|
|
|
|
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
|
|
|
|
// of instructions change.
|
|
|
|
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
|
|
|
|
MCSymbol *EndOfSled = OutContext.createTempSymbol();
|
|
|
|
OutStreamer->EmitLabel(BeginOfSled);
|
|
|
|
EmitToStreamer(*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::B).addExpr(
|
|
|
|
MCSymbolRefExpr::create(EndOfSled, OutContext)));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
|
|
|
|
EmitToStreamer(
|
|
|
|
*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
|
|
|
|
EmitToStreamer(*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::BL8_NOP)
|
|
|
|
.addExpr(MCSymbolRefExpr::create(
|
|
|
|
OutContext.getOrCreateSymbol("__xray_FunctionEntry"),
|
|
|
|
OutContext)));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
|
|
|
|
OutStreamer->EmitLabel(EndOfSled);
|
|
|
|
recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case TargetOpcode::PATCHABLE_FUNCTION_EXIT: {
|
|
|
|
// .p2align 3
|
|
|
|
// .begin:
|
|
|
|
// b(lr)? # lis 0, FuncId[16..32]
|
|
|
|
// nop # li 0, FuncId[0..15]
|
|
|
|
// std 0, -8(1)
|
|
|
|
// mflr 0
|
|
|
|
// bl __xray_FunctionExit
|
|
|
|
// mtlr 0
|
|
|
|
// .end:
|
|
|
|
// b(lr)?
|
|
|
|
//
|
|
|
|
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
|
|
|
|
// of instructions change.
|
|
|
|
const MachineInstr *Next = [&] {
|
|
|
|
MachineBasicBlock::const_iterator It(MI);
|
2017-02-11 06:13:34 +08:00
|
|
|
assert(It != MI->getParent()->end());
|
[XRay] Implement powerpc64le xray.
Summary:
powerpc64 big-endian is not supported, but I believe that most logic can
be shared, except for xray_powerpc64.cc.
Also add a function InvalidateInstructionCache to xray_util.h, which is
copied from llvm/Support/Memory.cpp. I'm not sure if I need to add a unittest,
and I don't know how.
Reviewers: dberris, echristo, iteratee, kbarton, hfinkel
Subscribers: mehdi_amini, nemanjai, mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D29742
llvm-svn: 294781
2017-02-11 05:03:24 +08:00
|
|
|
++It;
|
|
|
|
assert(It->isReturn());
|
|
|
|
return &*It;
|
|
|
|
}();
|
|
|
|
OutStreamer->EmitCodeAlignment(8);
|
|
|
|
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
|
|
|
|
OutStreamer->EmitLabel(BeginOfSled);
|
|
|
|
MCInst TmpInst;
|
|
|
|
LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false);
|
|
|
|
EmitToStreamer(*OutStreamer, TmpInst);
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
|
|
|
|
EmitToStreamer(
|
|
|
|
*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
|
|
|
|
EmitToStreamer(*OutStreamer,
|
|
|
|
MCInstBuilder(PPC::BL8_NOP)
|
|
|
|
.addExpr(MCSymbolRefExpr::create(
|
|
|
|
OutContext.getOrCreateSymbol("__xray_FunctionExit"),
|
|
|
|
OutContext)));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
|
|
|
|
recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case TargetOpcode::PATCHABLE_TAIL_CALL:
|
|
|
|
case TargetOpcode::PATCHABLE_RET:
|
|
|
|
// PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really
|
|
|
|
// lower to a PPC::B instruction. The PPC::B instruction is generated
|
|
|
|
// before it, and handled by the normal case.
|
2017-02-11 05:17:35 +08:00
|
|
|
llvm_unreachable("Tail call is handled in the normal case. See comments"
|
|
|
|
"around this assert.");
|
[XRay] Implement powerpc64le xray.
Summary:
powerpc64 big-endian is not supported, but I believe that most logic can
be shared, except for xray_powerpc64.cc.
Also add a function InvalidateInstructionCache to xray_util.h, which is
copied from llvm/Support/Memory.cpp. I'm not sure if I need to add a unittest,
and I don't know how.
Reviewers: dberris, echristo, iteratee, kbarton, hfinkel
Subscribers: mehdi_amini, nemanjai, mgorny, llvm-commits
Differential Revision: https://reviews.llvm.org/D29742
llvm-svn: 294781
2017-02-11 05:03:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
|
2015-02-17 15:21:21 +08:00
|
|
|
if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
PPCTargetStreamer *TS =
|
2015-04-25 03:11:51 +08:00
|
|
|
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
|
|
|
if (TS)
|
|
|
|
TS->emitAbiVersion(2);
|
|
|
|
}
|
|
|
|
|
2015-02-17 15:21:21 +08:00
|
|
|
if (static_cast<const PPCTargetMachine &>(TM).isPPC64() ||
|
2016-06-27 22:19:45 +08:00
|
|
|
!isPositionIndependent())
|
2014-07-19 07:29:49 +08:00
|
|
|
return AsmPrinter::EmitStartOfAsmFile(M);
|
|
|
|
|
2016-06-18 02:07:14 +08:00
|
|
|
if (M.getPICLevel() == PICLevel::SmallPIC)
|
2014-11-12 23:16:30 +08:00
|
|
|
return AsmPrinter::EmitStartOfAsmFile(M);
|
|
|
|
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(OutContext.getELFSection(
|
2015-01-30 01:33:21 +08:00
|
|
|
".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC));
|
2014-07-19 07:29:49 +08:00
|
|
|
|
2015-05-19 02:43:14 +08:00
|
|
|
MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC"));
|
|
|
|
MCSymbol *CurrentPos = OutContext.createTempSymbol();
|
2014-07-19 07:29:49 +08:00
|
|
|
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(CurrentPos);
|
2014-07-19 07:29:49 +08:00
|
|
|
|
|
|
|
// The GOT pointer points to the middle of the GOT, in order to reference the
|
|
|
|
// entire 64kB range. 0x8000 is the midpoint.
|
|
|
|
const MCExpr *tocExpr =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCBinaryExpr::createAdd(MCSymbolRefExpr::create(CurrentPos, OutContext),
|
|
|
|
MCConstantExpr::create(0x8000, OutContext),
|
2014-07-19 07:29:49 +08:00
|
|
|
OutContext);
|
|
|
|
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitAssignment(TOCSym, tocExpr);
|
2014-07-19 07:29:49 +08:00
|
|
|
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
|
2014-07-19 07:29:49 +08:00
|
|
|
}
|
|
|
|
|
2010-01-27 15:21:55 +08:00
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
|
2014-07-19 07:29:49 +08:00
|
|
|
// linux/ppc32 - Normal entry label.
|
2015-09-22 19:19:03 +08:00
|
|
|
if (!Subtarget->isPPC64() &&
|
2016-06-27 22:19:45 +08:00
|
|
|
(!isPositionIndependent() ||
|
2016-06-18 02:07:14 +08:00
|
|
|
MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC))
|
2010-01-27 15:21:55 +08:00
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
2014-07-19 07:29:49 +08:00
|
|
|
|
2015-02-10 08:44:17 +08:00
|
|
|
if (!Subtarget->isPPC64()) {
|
2014-07-19 07:29:49 +08:00
|
|
|
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
|
2015-09-22 19:15:07 +08:00
|
|
|
if (PPCFI->usesPICBase()) {
|
2014-07-19 07:29:49 +08:00
|
|
|
MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
|
|
|
|
MCSymbol *PICBase = MF->getPICBaseSymbol();
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(RelocSymbol);
|
2014-07-19 07:29:49 +08:00
|
|
|
|
|
|
|
const MCExpr *OffsExpr =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCBinaryExpr::createSub(
|
|
|
|
MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
|
2014-07-19 07:29:49 +08:00
|
|
|
OutContext),
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(PICBase, OutContext),
|
2014-07-19 07:29:49 +08:00
|
|
|
OutContext);
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitValue(OffsExpr, 4);
|
|
|
|
OutStreamer->EmitLabel(CurrentFnSym);
|
2014-07-19 07:29:49 +08:00
|
|
|
return;
|
|
|
|
} else
|
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
|
|
|
}
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
|
|
|
// ELFv2 ABI - Normal entry label.
|
2016-01-13 21:12:23 +08:00
|
|
|
if (Subtarget->isELFv2ABI()) {
|
|
|
|
// In the Large code model, we allow arbitrary displacements between
|
|
|
|
// the text section and its associated TOC section. We place the
|
|
|
|
// full 8-byte offset to the TOC in memory immediatedly preceding
|
|
|
|
// the function global entry point.
|
|
|
|
if (TM.getCodeModel() == CodeModel::Large
|
|
|
|
&& !MF->getRegInfo().use_empty(PPC::X2)) {
|
|
|
|
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
|
|
|
|
|
|
|
|
MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
|
|
|
|
MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol();
|
|
|
|
const MCExpr *TOCDeltaExpr =
|
|
|
|
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
|
|
|
|
MCSymbolRefExpr::create(GlobalEPSymbol,
|
|
|
|
OutContext),
|
|
|
|
OutContext);
|
|
|
|
|
|
|
|
OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol());
|
|
|
|
OutStreamer->EmitValue(TOCDeltaExpr, 8);
|
|
|
|
}
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
2016-01-13 21:12:23 +08:00
|
|
|
}
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
2010-01-27 15:21:55 +08:00
|
|
|
// Emit an official procedure descriptor.
|
2015-04-25 03:11:51 +08:00
|
|
|
MCSectionSubPair Current = OutStreamer->getCurrentSection();
|
2015-05-22 03:20:38 +08:00
|
|
|
MCSectionELF *Section = OutStreamer->getContext().getELFSection(
|
2015-01-30 01:33:21 +08:00
|
|
|
".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(Section);
|
|
|
|
OutStreamer->EmitLabel(CurrentFnSym);
|
|
|
|
OutStreamer->EmitValueToAlignment(8);
|
2015-03-06 03:47:50 +08:00
|
|
|
MCSymbol *Symbol1 = CurrentFnSymForSize;
|
2012-10-25 20:27:42 +08:00
|
|
|
// Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
|
|
|
|
// entry point.
|
2015-05-30 09:25:56 +08:00
|
|
|
OutStreamer->EmitValue(MCSymbolRefExpr::create(Symbol1, OutContext),
|
2015-04-25 03:11:51 +08:00
|
|
|
8 /*size*/);
|
2015-05-19 02:43:14 +08:00
|
|
|
MCSymbol *Symbol2 = OutContext.getOrCreateSymbol(StringRef(".TOC."));
|
2012-10-25 20:27:42 +08:00
|
|
|
// Generates a R_PPC64_TOC relocation for TOC base insertion.
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitValue(
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
|
2015-04-25 03:11:51 +08:00
|
|
|
8/*size*/);
|
2012-09-19 00:55:29 +08:00
|
|
|
// Emit a null environment pointer.
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitIntValue(0, 8 /* size */);
|
|
|
|
OutStreamer->SwitchSection(Current.first, Current.second);
|
2010-01-27 15:21:55 +08:00
|
|
|
}
|
|
|
|
|
2009-08-15 19:54:46 +08:00
|
|
|
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
|
2015-07-16 14:11:10 +08:00
|
|
|
const DataLayout &DL = getDataLayout();
|
2009-08-15 19:54:46 +08:00
|
|
|
|
2015-07-16 14:11:10 +08:00
|
|
|
bool isPPC64 = DL.getPointerSizeInBits() == 64;
|
2009-08-15 19:54:46 +08:00
|
|
|
|
2013-10-08 21:08:17 +08:00
|
|
|
PPCTargetStreamer &TS =
|
2015-04-25 03:11:51 +08:00
|
|
|
static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
|
2013-10-08 21:08:17 +08:00
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
if (!TOC.empty()) {
|
2015-05-22 03:20:38 +08:00
|
|
|
MCSectionELF *Section;
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
if (isPPC64)
|
2015-04-25 03:11:51 +08:00
|
|
|
Section = OutStreamer->getContext().getELFSection(
|
2015-01-30 01:33:21 +08:00
|
|
|
".toc", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
|
|
|
|
else
|
2015-04-25 03:11:51 +08:00
|
|
|
Section = OutStreamer->getContext().getELFSection(
|
2015-01-30 01:33:21 +08:00
|
|
|
".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(Section);
|
2009-08-15 19:54:46 +08:00
|
|
|
|
2012-11-13 03:13:24 +08:00
|
|
|
for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
|
2010-01-16 10:09:06 +08:00
|
|
|
E = TOC.end(); I != E; ++I) {
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(I->second);
|
2014-10-31 18:33:14 +08:00
|
|
|
MCSymbol *S = I->first;
|
2016-08-30 09:43:38 +08:00
|
|
|
if (isPPC64) {
|
2014-07-19 07:29:49 +08:00
|
|
|
TS.emitTCEntry(*S);
|
2016-08-30 09:43:38 +08:00
|
|
|
} else {
|
|
|
|
OutStreamer->EmitValueToAlignment(4);
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitSymbolValue(S, 4);
|
2016-08-30 09:43:38 +08:00
|
|
|
}
|
2009-08-15 19:54:46 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return AsmPrinter::doFinalization(M);
|
|
|
|
}
|
2006-12-22 04:26:09 +08:00
|
|
|
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
|
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
|
|
|
|
// In the ELFv2 ABI, in functions that use the TOC register, we need to
|
|
|
|
// provide two entry points. The ABI guarantees that when calling the
|
|
|
|
// local entry point, r2 is set up by the caller to contain the TOC base
|
|
|
|
// for this function, and when calling the global entry point, r12 is set
|
|
|
|
// up by the caller to hold the address of the global entry point. We
|
|
|
|
// thus emit a prefix sequence along the following lines:
|
|
|
|
//
|
|
|
|
// func:
|
2016-01-13 21:12:23 +08:00
|
|
|
// .Lfunc_gepNN:
|
|
|
|
// # global entry point
|
|
|
|
// addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha
|
|
|
|
// addi r2,r2,(.TOC.-.Lfunc_gepNN)@l
|
|
|
|
// .Lfunc_lepNN:
|
|
|
|
// .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
|
|
|
|
// # local entry point, followed by function body
|
|
|
|
//
|
|
|
|
// For the Large code model, we create
|
|
|
|
//
|
|
|
|
// .Lfunc_tocNN:
|
|
|
|
// .quad .TOC.-.Lfunc_gepNN # done by EmitFunctionEntryLabel
|
|
|
|
// func:
|
|
|
|
// .Lfunc_gepNN:
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
// # global entry point
|
2016-01-13 21:12:23 +08:00
|
|
|
// ld r2,.Lfunc_tocNN-.Lfunc_gepNN(r12)
|
|
|
|
// add r2,r2,r12
|
|
|
|
// .Lfunc_lepNN:
|
|
|
|
// .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
// # local entry point, followed by function body
|
|
|
|
//
|
|
|
|
// This ensures we have r2 set up correctly while executing the function
|
|
|
|
// body, no matter which entry point is called.
|
2015-02-10 08:44:17 +08:00
|
|
|
if (Subtarget->isELFv2ABI()
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
// Only do all that if the function uses r2 in the first place.
|
|
|
|
&& !MF->getRegInfo().use_empty(PPC::X2)) {
|
2016-10-03 12:06:44 +08:00
|
|
|
// Note: The logic here must be synchronized with the code in the
|
|
|
|
// branch-selection pass which sets the offset of the first block in the
|
|
|
|
// function. This matters because it affects the alignment.
|
2016-01-13 21:12:23 +08:00
|
|
|
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
2016-01-13 21:12:23 +08:00
|
|
|
MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol();
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(GlobalEntryLabel);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
const MCSymbolRefExpr *GlobalEntryLabelExp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(GlobalEntryLabel, OutContext);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
2016-01-13 21:12:23 +08:00
|
|
|
if (TM.getCodeModel() != CodeModel::Large) {
|
|
|
|
MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
|
|
|
|
const MCExpr *TOCDeltaExpr =
|
|
|
|
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
|
|
|
|
GlobalEntryLabelExp, OutContext);
|
|
|
|
|
|
|
|
const MCExpr *TOCDeltaHi =
|
|
|
|
PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext);
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X12)
|
|
|
|
.addExpr(TOCDeltaHi));
|
|
|
|
|
|
|
|
const MCExpr *TOCDeltaLo =
|
|
|
|
PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext);
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(TOCDeltaLo));
|
|
|
|
} else {
|
|
|
|
MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol();
|
|
|
|
const MCExpr *TOCOffsetDeltaExpr =
|
|
|
|
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext),
|
|
|
|
GlobalEntryLabelExp, OutContext);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
2016-01-13 21:12:23 +08:00
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(TOCOffsetDeltaExpr)
|
|
|
|
.addReg(PPC::X12));
|
|
|
|
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X12));
|
|
|
|
}
|
|
|
|
|
|
|
|
MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol();
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitLabel(LocalEntryLabel);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
const MCSymbolRefExpr *LocalEntryLabelExp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCSymbolRefExpr::create(LocalEntryLabel, OutContext);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
const MCExpr *LocalOffsetExp =
|
2015-05-30 09:25:56 +08:00
|
|
|
MCBinaryExpr::createSub(LocalEntryLabelExp,
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
GlobalEntryLabelExp, OutContext);
|
|
|
|
|
|
|
|
PPCTargetStreamer *TS =
|
2015-04-25 03:11:51 +08:00
|
|
|
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
|
|
|
if (TS)
|
2015-06-03 04:38:46 +08:00
|
|
|
TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp);
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-08-29 03:06:55 +08:00
|
|
|
/// EmitFunctionBodyEnd - Print the traceback table before the .size
|
|
|
|
/// directive.
|
|
|
|
///
|
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
|
|
|
|
// Only the 64-bit target requires a traceback table. For now,
|
|
|
|
// we only emit the word of zeroes that GDB requires to find
|
2012-08-30 04:22:24 +08:00
|
|
|
// the end of the function, and zeroes for the eight-byte
|
|
|
|
// mandatory fields.
|
|
|
|
// FIXME: We should fill in the eight-byte mandatory fields as described in
|
|
|
|
// the PPC64 ELF ABI (this is a low-priority item because GDB does not
|
|
|
|
// currently make use of these fields).
|
2015-02-10 08:44:17 +08:00
|
|
|
if (Subtarget->isPPC64()) {
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitIntValue(0, 4/*size*/);
|
|
|
|
OutStreamer->EmitIntValue(0, 8/*size*/);
|
2012-08-30 04:22:24 +08:00
|
|
|
}
|
2012-08-29 03:06:55 +08:00
|
|
|
}
|
|
|
|
|
2009-10-01 06:06:26 +08:00
|
|
|
void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
|
2008-03-26 05:45:14 +08:00
|
|
|
static const char *const CPUDirectives[] = {
|
2008-02-15 07:35:16 +08:00
|
|
|
"",
|
2006-12-13 04:57:08 +08:00
|
|
|
"ppc",
|
2011-10-17 12:03:49 +08:00
|
|
|
"ppc440",
|
2006-12-13 04:57:08 +08:00
|
|
|
"ppc601",
|
|
|
|
"ppc602",
|
|
|
|
"ppc603",
|
|
|
|
"ppc7400",
|
|
|
|
"ppc750",
|
|
|
|
"ppc970",
|
2012-04-02 03:22:40 +08:00
|
|
|
"ppcA2",
|
2012-08-29 00:12:39 +08:00
|
|
|
"ppce500mc",
|
|
|
|
"ppce5500",
|
2013-02-04 08:47:33 +08:00
|
|
|
"power3",
|
|
|
|
"power4",
|
|
|
|
"power5",
|
|
|
|
"power5x",
|
2012-06-11 23:43:08 +08:00
|
|
|
"power6",
|
2013-02-04 08:47:33 +08:00
|
|
|
"power6x",
|
2012-06-11 23:43:08 +08:00
|
|
|
"power7",
|
2016-05-10 02:54:58 +08:00
|
|
|
// FIXME: why is power8 missing here?
|
2013-07-26 09:35:43 +08:00
|
|
|
"ppc64",
|
2016-05-10 02:54:58 +08:00
|
|
|
"ppc64le",
|
|
|
|
"power9"
|
2006-12-13 04:57:08 +08:00
|
|
|
};
|
|
|
|
|
2015-02-17 15:21:21 +08:00
|
|
|
// Get the numerically largest directive.
|
|
|
|
// FIXME: How should we merge darwin directives?
|
|
|
|
unsigned Directive = PPC::DIR_NONE;
|
|
|
|
for (const Function &F : M) {
|
2015-02-20 15:32:59 +08:00
|
|
|
const PPCSubtarget &STI = TM.getSubtarget<PPCSubtarget>(F);
|
2015-02-17 15:21:21 +08:00
|
|
|
unsigned FDir = STI.getDarwinDirective();
|
|
|
|
Directive = Directive > FDir ? FDir : STI.getDarwinDirective();
|
|
|
|
if (STI.hasMFOCRF() && Directive < PPC::DIR_970)
|
|
|
|
Directive = PPC::DIR_970;
|
|
|
|
if (STI.hasAltivec() && Directive < PPC::DIR_7400)
|
|
|
|
Directive = PPC::DIR_7400;
|
|
|
|
if (STI.isPPC64() && Directive < PPC::DIR_64)
|
|
|
|
Directive = PPC::DIR_64;
|
|
|
|
}
|
|
|
|
|
2006-12-13 04:57:08 +08:00
|
|
|
assert(Directive <= PPC::DIR_64 && "Directive out of range.");
|
2014-01-25 10:35:56 +08:00
|
|
|
|
|
|
|
assert(Directive < array_lengthof(CPUDirectives) &&
|
|
|
|
"CPUDirectives[] might not be up-to-date!");
|
|
|
|
PPCTargetStreamer &TStreamer =
|
2015-04-25 03:11:51 +08:00
|
|
|
*static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
|
2014-01-25 10:35:56 +08:00
|
|
|
TStreamer.emitMachine(CPUDirectives[Directive]);
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-11-29 02:21:52 +08:00
|
|
|
// Prime text sections so they are adjacent. This reduces the likelihood a
|
|
|
|
// large data or debug section causes a branch to exceed 16M limit.
|
2015-09-22 19:13:55 +08:00
|
|
|
const TargetLoweringObjectFileMachO &TLOFMacho =
|
|
|
|
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection());
|
2006-11-29 02:21:52 +08:00
|
|
|
if (TM.getRelocationModel() == Reloc::PIC_) {
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2009-08-19 13:49:37 +08:00
|
|
|
32, SectionKind::getText()));
|
2006-11-29 02:21:52 +08:00
|
|
|
} else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT","__symbol_stub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2009-08-19 13:49:37 +08:00
|
|
|
16, SectionKind::getText()));
|
2006-11-29 02:21:52 +08:00
|
|
|
}
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
|
2005-07-21 09:25:49 +08:00
|
|
|
}
|
|
|
|
|
2010-01-21 05:19:44 +08:00
|
|
|
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
|
2015-07-16 14:11:10 +08:00
|
|
|
bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
|
2010-01-21 05:19:44 +08:00
|
|
|
|
|
|
|
// Darwin/PPC always uses mach-o.
|
2015-09-22 19:13:55 +08:00
|
|
|
const TargetLoweringObjectFileMachO &TLOFMacho =
|
|
|
|
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
|
2016-11-04 07:33:46 +08:00
|
|
|
if (MMI) {
|
|
|
|
MachineModuleInfoMachO &MMIMacho =
|
|
|
|
MMI->getObjFileInfo<MachineModuleInfoMachO>();
|
|
|
|
|
|
|
|
if (MAI->doesSupportExceptionHandling()) {
|
|
|
|
// Add the (possibly multiple) personalities to the set of global values.
|
|
|
|
// Only referenced functions get into the Personalities list.
|
|
|
|
for (const Function *Personality : MMI->getPersonalities()) {
|
|
|
|
if (Personality) {
|
|
|
|
MCSymbol *NLPSym =
|
|
|
|
getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr");
|
|
|
|
MachineModuleInfoImpl::StubValueTy &StubSym =
|
|
|
|
MMIMacho.getGVStubEntry(NLPSym);
|
|
|
|
StubSym =
|
|
|
|
MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true);
|
|
|
|
}
|
2010-01-21 05:16:14 +08:00
|
|
|
}
|
2009-07-15 09:14:44 +08:00
|
|
|
}
|
2007-11-21 07:24:42 +08:00
|
|
|
|
2016-11-04 07:33:46 +08:00
|
|
|
// Output stubs for dynamically-linked functions.
|
|
|
|
MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
|
|
|
|
|
|
|
|
// Output macho stubs for external and common global variables.
|
|
|
|
if (!Stubs.empty()) {
|
|
|
|
// Switch with ".non_lazy_symbol_pointer" directive.
|
|
|
|
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
|
|
|
|
EmitAlignment(isPPC64 ? 3 : 2);
|
|
|
|
|
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
|
|
|
// L_foo$stub:
|
|
|
|
OutStreamer->EmitLabel(Stubs[i].first);
|
|
|
|
// .indirect_symbol _foo
|
|
|
|
MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
|
|
|
|
OutStreamer->EmitSymbolAttribute(MCSym.getPointer(),
|
|
|
|
MCSA_IndirectSymbol);
|
|
|
|
|
|
|
|
if (MCSym.getInt())
|
|
|
|
// External to current translation unit.
|
|
|
|
OutStreamer->EmitIntValue(0, isPPC64 ? 8 : 4 /*size*/);
|
|
|
|
else
|
|
|
|
// Internal to current translation unit.
|
|
|
|
//
|
|
|
|
// When we place the LSDA into the TEXT section, the type info
|
|
|
|
// pointers
|
|
|
|
// need to be indirect and pc-rel. We accomplish this by using NLPs.
|
|
|
|
// However, sometimes the types are local to the file. So we need to
|
|
|
|
// fill in the value for the NLP in those cases.
|
|
|
|
OutStreamer->EmitValue(
|
|
|
|
MCSymbolRefExpr::create(MCSym.getPointer(), OutContext),
|
|
|
|
isPPC64 ? 8 : 4 /*size*/);
|
|
|
|
}
|
2010-03-12 07:39:44 +08:00
|
|
|
|
2016-11-04 07:33:46 +08:00
|
|
|
Stubs.clear();
|
|
|
|
OutStreamer->AddBlankLine();
|
|
|
|
}
|
2004-08-15 06:09:10 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2005-11-01 08:12:36 +08:00
|
|
|
// Funny Darwin hack: This flag tells the linker that no global symbols
|
|
|
|
// contain code that falls through to other global symbols (e.g. the obvious
|
|
|
|
// implementation of multiple entry points). If this doesn't occur, the
|
|
|
|
// linker can safely perform dead code stripping. Since LLVM never generates
|
|
|
|
// code that does this, it is always safe to set.
|
2015-04-25 03:11:51 +08:00
|
|
|
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
|
2005-11-01 08:12:36 +08:00
|
|
|
|
2007-07-26 03:33:14 +08:00
|
|
|
return AsmPrinter::doFinalization(M);
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2006-12-21 04:56:46 +08:00
|
|
|
/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
|
|
|
|
/// for a MachineFunction to the given output stream, in a format that the
|
2006-09-21 01:12:19 +08:00
|
|
|
/// Darwin assembler can deal with.
|
|
|
|
///
|
2015-01-19 04:29:04 +08:00
|
|
|
static AsmPrinter *
|
|
|
|
createPPCAsmPrinterPass(TargetMachine &tm,
|
|
|
|
std::unique_ptr<MCStreamer> &&Streamer) {
|
2015-06-16 23:44:21 +08:00
|
|
|
if (tm.getTargetTriple().isMacOSX())
|
2015-01-19 04:29:04 +08:00
|
|
|
return new PPCDarwinAsmPrinter(tm, std::move(Streamer));
|
|
|
|
return new PPCLinuxAsmPrinter(tm, std::move(Streamer));
|
2006-09-21 01:12:19 +08:00
|
|
|
}
|
2008-08-17 21:54:28 +08:00
|
|
|
|
2009-06-24 07:59:40 +08:00
|
|
|
// Force static initialization.
|
2015-09-22 19:19:03 +08:00
|
|
|
extern "C" void LLVMInitializePowerPCAsmPrinter() {
|
2016-10-10 07:00:34 +08:00
|
|
|
TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
|
|
|
|
createPPCAsmPrinterPass);
|
|
|
|
TargetRegistry::RegisterAsmPrinter(getThePPC64Target(),
|
|
|
|
createPPCAsmPrinterPass);
|
|
|
|
TargetRegistry::RegisterAsmPrinter(getThePPC64LETarget(),
|
|
|
|
createPPCAsmPrinterPass);
|
2009-07-16 04:24:03 +08:00
|
|
|
}
|