2012-02-18 20:03:15 +08:00
|
|
|
//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2004-07-09 01:58:04 +08:00
|
|
|
// This file contains a printer that converts from our internal representation
|
|
|
|
// of machine-dependent LLVM code to PowerPC assembly language. This printer is
|
2004-07-29 04:18:53 +08:00
|
|
|
// the output mechanism used by `llc'.
|
2004-06-22 00:55:25 +08:00
|
|
|
//
|
2004-07-09 01:58:04 +08:00
|
|
|
// Documentation at http://developer.apple.com/documentation/DeveloperTools/
|
|
|
|
// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
|
2004-06-30 01:13:26 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-10-15 07:51:18 +08:00
|
|
|
#include "PPC.h"
|
2012-03-18 02:46:09 +08:00
|
|
|
#include "InstPrinter/PPCInstPrinter.h"
|
2014-07-19 07:29:49 +08:00
|
|
|
#include "PPCMachineFunctionInfo.h"
|
2013-05-24 06:26:41 +08:00
|
|
|
#include "MCTargetDesc/PPCMCExpr.h"
|
2014-01-07 19:48:04 +08:00
|
|
|
#include "MCTargetDesc/PPCPredicates.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "PPCSubtarget.h"
|
|
|
|
#include "PPCTargetMachine.h"
|
2013-10-08 21:08:17 +08:00
|
|
|
#include "PPCTargetStreamer.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
|
|
|
#include "llvm/ADT/SmallString.h"
|
|
|
|
#include "llvm/ADT/StringExtras.h"
|
2004-08-17 07:25:21 +08:00
|
|
|
#include "llvm/CodeGen/AsmPrinter.h"
|
2014-07-19 07:29:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineConstantPool.h"
|
2004-06-25 01:31:42 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
2004-06-22 00:55:25 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2008-01-26 14:51:24 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2010-01-21 05:16:14 +08:00
|
|
|
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2010-02-16 06:37:53 +08:00
|
|
|
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Constants.h"
|
2014-03-06 08:46:21 +08:00
|
|
|
#include "llvm/IR/DebugInfo.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2014-01-08 05:19:40 +08:00
|
|
|
#include "llvm/IR/Mangler.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Module.h"
|
2009-09-14 01:14:04 +08:00
|
|
|
#include "llvm/MC/MCAsmInfo.h"
|
2010-01-14 03:00:57 +08:00
|
|
|
#include "llvm/MC/MCContext.h"
|
2010-03-12 07:39:44 +08:00
|
|
|
#include "llvm/MC/MCExpr.h"
|
2010-11-15 03:53:02 +08:00
|
|
|
#include "llvm/MC/MCInst.h"
|
2012-11-26 21:34:22 +08:00
|
|
|
#include "llvm/MC/MCInstBuilder.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/MC/MCSectionELF.h"
|
2009-08-11 02:15:01 +08:00
|
|
|
#include "llvm/MC/MCSectionMachO.h"
|
2009-08-19 13:49:37 +08:00
|
|
|
#include "llvm/MC/MCStreamer.h"
|
2009-09-14 01:14:04 +08:00
|
|
|
#include "llvm/MC/MCSymbol.h"
|
2010-11-15 03:53:02 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2010-08-05 06:07:50 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Support/ELF.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2011-08-25 02:08:43 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
2010-04-04 16:18:47 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Target/TargetInstrInfo.h"
|
|
|
|
#include "llvm/Target/TargetOptions.h"
|
|
|
|
#include "llvm/Target/TargetRegisterInfo.h"
|
2004-08-17 07:25:21 +08:00
|
|
|
using namespace llvm;
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2014-04-22 10:41:26 +08:00
|
|
|
#define DEBUG_TYPE "asmprinter"
|
|
|
|
|
2006-12-20 06:59:26 +08:00
|
|
|
namespace {
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCAsmPrinter : public AsmPrinter {
|
2009-02-24 16:30:20 +08:00
|
|
|
protected:
|
2012-11-13 03:13:24 +08:00
|
|
|
MapVector<MCSymbol*, MCSymbol*> TOC;
|
2006-09-21 01:07:15 +08:00
|
|
|
const PPCSubtarget &Subtarget;
|
2010-11-15 11:42:54 +08:00
|
|
|
uint64_t TOCLabelID;
|
2009-02-24 16:30:20 +08:00
|
|
|
public:
|
2010-04-04 16:18:47 +08:00
|
|
|
explicit PPCAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
|
|
|
|
: AsmPrinter(TM, Streamer),
|
2010-11-15 11:42:54 +08:00
|
|
|
Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
const char *getPassName() const override {
|
2004-09-04 13:00:00 +08:00
|
|
|
return "PowerPC Assembly Printer";
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
|
|
|
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
|
2004-08-15 06:09:10 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitInstruction(const MachineInstr *MI) override;
|
2004-08-15 07:27:29 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-02-02 06:38:46 +08:00
|
|
|
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
|
2010-04-04 13:29:35 +08:00
|
|
|
unsigned AsmVariant, const char *ExtraCode,
|
2014-04-29 15:57:37 +08:00
|
|
|
raw_ostream &O) override;
|
2006-02-25 04:27:40 +08:00
|
|
|
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
|
2010-04-04 13:29:35 +08:00
|
|
|
unsigned AsmVariant, const char *ExtraCode,
|
2014-04-29 15:57:37 +08:00
|
|
|
raw_ostream &O) override;
|
2004-09-04 13:00:00 +08:00
|
|
|
};
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2008-08-09 02:22:59 +08:00
|
|
|
/// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCLinuxAsmPrinter : public PPCAsmPrinter {
|
2009-02-24 16:30:20 +08:00
|
|
|
public:
|
2010-04-04 16:18:47 +08:00
|
|
|
explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
|
|
|
|
: PPCAsmPrinter(TM, Streamer) {}
|
2006-12-22 04:26:09 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
const char *getPassName() const override {
|
2006-12-22 04:26:09 +08:00
|
|
|
return "Linux PPC Assembly Printer";
|
|
|
|
}
|
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
bool doFinalization(Module &M) override;
|
2014-07-19 07:29:49 +08:00
|
|
|
void EmitStartOfAsmFile(Module &M) override;
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitFunctionEntryLabel() override;
|
2012-08-29 03:06:55 +08:00
|
|
|
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
void EmitFunctionBodyStart() override;
|
2014-04-29 15:57:37 +08:00
|
|
|
void EmitFunctionBodyEnd() override;
|
2006-12-22 04:26:09 +08:00
|
|
|
};
|
|
|
|
|
2008-08-09 02:22:59 +08:00
|
|
|
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
|
|
|
|
/// OS X
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCDarwinAsmPrinter : public PPCAsmPrinter {
|
2009-02-24 16:30:20 +08:00
|
|
|
public:
|
2010-04-04 16:18:47 +08:00
|
|
|
explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
|
|
|
|
: PPCAsmPrinter(TM, Streamer) {}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
const char *getPassName() const override {
|
2004-09-04 13:00:00 +08:00
|
|
|
return "Darwin PPC Assembly Printer";
|
|
|
|
}
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2014-04-29 15:57:37 +08:00
|
|
|
bool doFinalization(Module &M) override;
|
|
|
|
void EmitStartOfAsmFile(Module &M) override;
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2010-01-21 05:36:48 +08:00
|
|
|
void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
|
2004-06-22 00:55:25 +08:00
|
|
|
};
|
|
|
|
} // end of anonymous namespace
|
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
/// stripRegisterPrefix - This method strips the character prefix from a
|
|
|
|
/// register name so that only the number is left. Used by for linux asm.
|
|
|
|
static const char *stripRegisterPrefix(const char *RegName) {
|
|
|
|
switch (RegName[0]) {
|
|
|
|
case 'r':
|
|
|
|
case 'f':
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
case 'v':
|
|
|
|
if (RegName[1] == 's')
|
|
|
|
return RegName + 2;
|
|
|
|
return RegName + 1;
|
2010-11-15 11:39:06 +08:00
|
|
|
case 'c': if (RegName[1] == 'r') return RegName + 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
return RegName;
|
|
|
|
}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
|
|
|
|
raw_ostream &O) {
|
2014-01-04 03:21:54 +08:00
|
|
|
const DataLayout *DL = TM.getDataLayout();
|
2010-11-15 11:39:06 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(OpNo);
|
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
switch (MO.getType()) {
|
2010-11-15 11:39:06 +08:00
|
|
|
case MachineOperand::MO_Register: {
|
|
|
|
const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
|
|
|
|
// Linux assembler (Others?) does not take register mnemonics.
|
|
|
|
// FIXME - What about special registers used in mfspr/mtspr?
|
|
|
|
if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
|
|
|
|
O << RegName;
|
|
|
|
return;
|
|
|
|
}
|
2006-05-05 01:21:20 +08:00
|
|
|
case MachineOperand::MO_Immediate:
|
2010-11-15 11:39:06 +08:00
|
|
|
O << MO.getImm();
|
|
|
|
return;
|
2004-07-28 08:00:48 +08:00
|
|
|
|
2006-04-23 02:53:45 +08:00
|
|
|
case MachineOperand::MO_MachineBasicBlock:
|
2010-03-14 05:04:28 +08:00
|
|
|
O << *MO.getMBB()->getSymbol();
|
2006-04-23 02:53:45 +08:00
|
|
|
return;
|
2004-07-09 01:58:04 +08:00
|
|
|
case MachineOperand::MO_ConstantPoolIndex:
|
2014-01-04 03:21:54 +08:00
|
|
|
O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
|
2007-12-31 07:10:15 +08:00
|
|
|
<< '_' << MO.getIndex();
|
2004-06-22 00:55:25 +08:00
|
|
|
return;
|
2009-11-05 05:31:18 +08:00
|
|
|
case MachineOperand::MO_BlockAddress:
|
2010-01-18 05:43:43 +08:00
|
|
|
O << *GetBlockAddressSymbol(MO.getBlockAddress());
|
2009-11-05 05:31:18 +08:00
|
|
|
return;
|
2004-08-13 17:32:01 +08:00
|
|
|
case MachineOperand::MO_GlobalAddress: {
|
2005-12-16 08:22:14 +08:00
|
|
|
// Computing the address of a global symbol, not calling it.
|
2010-04-15 09:51:59 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
2010-01-16 10:00:23 +08:00
|
|
|
MCSymbol *SymToPrint;
|
2004-08-13 17:32:01 +08:00
|
|
|
|
2004-10-18 07:01:34 +08:00
|
|
|
// External or weakly linked global variables need non-lazily-resolved stubs
|
2009-07-15 09:14:44 +08:00
|
|
|
if (TM.getRelocationModel() != Reloc::Static &&
|
|
|
|
(GV->isDeclaration() || GV->isWeakForLinker())) {
|
|
|
|
if (!GV->hasHiddenVisibility()) {
|
2013-12-03 00:25:47 +08:00
|
|
|
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
|
2010-03-11 06:34:10 +08:00
|
|
|
MachineModuleInfoImpl::StubValueTy &StubSym =
|
|
|
|
MMI->getObjFileInfo<MachineModuleInfoMachO>()
|
|
|
|
.getGVStubEntry(SymToPrint);
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!StubSym.getPointer())
|
2010-03-11 06:34:10 +08:00
|
|
|
StubSym = MachineModuleInfoImpl::
|
2013-10-30 01:07:16 +08:00
|
|
|
StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
|
2009-07-15 09:14:44 +08:00
|
|
|
} else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
|
|
|
|
GV->hasAvailableExternallyLinkage()) {
|
2013-12-03 00:25:47 +08:00
|
|
|
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
|
2010-01-21 05:16:14 +08:00
|
|
|
|
2010-03-11 06:34:10 +08:00
|
|
|
MachineModuleInfoImpl::StubValueTy &StubSym =
|
2010-01-21 05:16:14 +08:00
|
|
|
MMI->getObjFileInfo<MachineModuleInfoMachO>().
|
|
|
|
getHiddenGVStubEntry(SymToPrint);
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!StubSym.getPointer())
|
2010-03-11 06:34:10 +08:00
|
|
|
StubSym = MachineModuleInfoImpl::
|
2013-10-30 01:07:16 +08:00
|
|
|
StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
|
2009-07-15 09:14:44 +08:00
|
|
|
} else {
|
2013-10-30 01:07:16 +08:00
|
|
|
SymToPrint = getSymbol(GV);
|
2005-12-16 08:22:14 +08:00
|
|
|
}
|
2009-07-15 09:14:44 +08:00
|
|
|
} else {
|
2013-10-30 01:07:16 +08:00
|
|
|
SymToPrint = getSymbol(GV);
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
2010-01-16 10:00:23 +08:00
|
|
|
|
2010-01-18 05:43:43 +08:00
|
|
|
O << *SymToPrint;
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2010-04-04 06:28:33 +08:00
|
|
|
printOffset(MO.getOffset(), O);
|
2004-06-22 00:55:25 +08:00
|
|
|
return;
|
2004-08-13 17:32:01 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
default:
|
2014-05-17 07:28:17 +08:00
|
|
|
O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
|
2004-06-25 23:11:34 +08:00
|
|
|
return;
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-02-24 03:31:10 +08:00
|
|
|
/// PrintAsmOperand - Print out an operand for an inline asm expression.
|
|
|
|
///
|
|
|
|
bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
|
2008-08-09 02:22:59 +08:00
|
|
|
unsigned AsmVariant,
|
2010-04-04 13:29:35 +08:00
|
|
|
const char *ExtraCode, raw_ostream &O) {
|
2006-02-24 03:31:10 +08:00
|
|
|
// Does this asm operand have a single letter operand modifier?
|
|
|
|
if (ExtraCode && ExtraCode[0]) {
|
|
|
|
if (ExtraCode[1] != 0) return true; // Unknown modifier.
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-02-24 03:31:10 +08:00
|
|
|
switch (ExtraCode[0]) {
|
2012-06-26 21:49:27 +08:00
|
|
|
default:
|
|
|
|
// See if this is a generic print operand
|
|
|
|
return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
|
2007-01-25 10:52:50 +08:00
|
|
|
case 'c': // Don't print "$" before a global var name or constant.
|
2010-11-15 11:39:06 +08:00
|
|
|
break; // PPC never has a prefix.
|
2008-08-09 02:22:59 +08:00
|
|
|
case 'L': // Write second word of DImode reference.
|
2006-02-24 03:31:10 +08:00
|
|
|
// Verify that this operand has two consecutive registers.
|
2008-10-03 23:45:36 +08:00
|
|
|
if (!MI->getOperand(OpNo).isReg() ||
|
2006-02-24 03:31:10 +08:00
|
|
|
OpNo+1 == MI->getNumOperands() ||
|
2008-10-03 23:45:36 +08:00
|
|
|
!MI->getOperand(OpNo+1).isReg())
|
2006-02-24 03:31:10 +08:00
|
|
|
return true;
|
|
|
|
++OpNo; // Return the high-part.
|
|
|
|
break;
|
2007-04-25 06:51:03 +08:00
|
|
|
case 'I':
|
|
|
|
// Write 'i' if an integer constant, otherwise nothing. Used to print
|
|
|
|
// addi vs add, etc.
|
2008-10-03 23:45:36 +08:00
|
|
|
if (MI->getOperand(OpNo).isImm())
|
2007-04-25 06:51:03 +08:00
|
|
|
O << "i";
|
|
|
|
return false;
|
2006-02-24 03:31:10 +08:00
|
|
|
}
|
|
|
|
}
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2010-04-04 12:47:45 +08:00
|
|
|
printOperand(MI, OpNo, O);
|
2006-02-24 03:31:10 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-08-18 08:18:39 +08:00
|
|
|
// At the moment, all inline asm memory operands are a single register.
|
|
|
|
// In any case, the output of this routine should always be just one
|
|
|
|
// assembler operand.
|
|
|
|
|
2006-02-25 04:27:40 +08:00
|
|
|
bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
|
2008-08-09 02:22:59 +08:00
|
|
|
unsigned AsmVariant,
|
2010-04-04 13:29:35 +08:00
|
|
|
const char *ExtraCode,
|
|
|
|
raw_ostream &O) {
|
2012-11-06 02:18:42 +08:00
|
|
|
if (ExtraCode && ExtraCode[0]) {
|
|
|
|
if (ExtraCode[1] != 0) return true; // Unknown modifier.
|
|
|
|
|
|
|
|
switch (ExtraCode[0]) {
|
|
|
|
default: return true; // Unknown modifier.
|
|
|
|
case 'y': // A memory reference for an X-form instruction
|
|
|
|
{
|
|
|
|
const char *RegName = "r0";
|
|
|
|
if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
|
|
|
|
O << RegName << ", ";
|
|
|
|
printOperand(MI, OpNo, O);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
assert(MI->getOperand(OpNo).isReg());
|
2009-08-27 02:10:32 +08:00
|
|
|
O << "0(";
|
2010-04-04 12:47:45 +08:00
|
|
|
printOperand(MI, OpNo, O);
|
2009-08-27 02:10:32 +08:00
|
|
|
O << ")";
|
2006-02-25 04:27:40 +08:00
|
|
|
return false;
|
|
|
|
}
|
2006-02-24 03:31:10 +08:00
|
|
|
|
2006-11-04 13:27:39 +08:00
|
|
|
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
|
|
|
|
/// exists for it. If not, create one. Then return a symbol that references
|
|
|
|
/// the TOC entry.
|
|
|
|
MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
|
2014-01-04 03:21:54 +08:00
|
|
|
const DataLayout *DL = TM.getDataLayout();
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MCSymbol *&TOCEntry = TOC[Sym];
|
|
|
|
|
|
|
|
// To avoid name clash check if the name already exists.
|
2014-04-25 13:30:21 +08:00
|
|
|
while (!TOCEntry) {
|
2014-01-04 03:21:54 +08:00
|
|
|
if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) +
|
2014-04-25 13:30:21 +08:00
|
|
|
"C" + Twine(TOCLabelID++)) == nullptr) {
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
TOCEntry = GetTempSymbol("C", TOCLabelID);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return TOCEntry;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-01-28 09:28:58 +08:00
|
|
|
/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
|
2004-08-15 06:09:10 +08:00
|
|
|
/// the current output stream.
|
2004-06-22 00:55:25 +08:00
|
|
|
///
|
2010-01-28 09:28:58 +08:00
|
|
|
void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
|
2010-11-15 11:39:06 +08:00
|
|
|
MCInst TmpInst;
|
2013-12-21 02:08:54 +08:00
|
|
|
bool isPPC64 = Subtarget.isPPC64();
|
2010-11-15 11:39:06 +08:00
|
|
|
|
|
|
|
// Lower multi-instruction pseudo operations.
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: break;
|
2013-06-17 04:34:27 +08:00
|
|
|
case TargetOpcode::DBG_VALUE:
|
|
|
|
llvm_unreachable("Should be handled target independently");
|
2010-11-15 11:39:06 +08:00
|
|
|
case PPC::MovePCtoLR:
|
|
|
|
case PPC::MovePCtoLR8: {
|
|
|
|
// Transform %LR = MovePCtoLR
|
|
|
|
// Into this, where the label is the PIC base:
|
|
|
|
// bl L1$pb
|
|
|
|
// L1$pb:
|
|
|
|
MCSymbol *PICBase = MF->getPICBaseSymbol();
|
2010-11-15 06:03:15 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Emit the 'bl'.
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
|
2012-11-26 21:34:22 +08:00
|
|
|
// FIXME: We would like an efficient form for this, so we don't have to do
|
|
|
|
// a lot of extra uniquing.
|
2012-11-27 02:05:52 +08:00
|
|
|
.addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
|
2010-11-15 11:39:06 +08:00
|
|
|
|
|
|
|
// Emit the label.
|
|
|
|
OutStreamer.EmitLabel(PICBase);
|
|
|
|
return;
|
|
|
|
}
|
2014-07-19 07:29:49 +08:00
|
|
|
case PPC::GetGBRO: {
|
|
|
|
// Get the offset from the GOT Base Register to the GOT
|
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
|
|
|
MCSymbol *PICOffset = MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
|
|
|
|
TmpInst.setOpcode(PPC::LWZ);
|
|
|
|
const MCExpr *Exp =
|
|
|
|
MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
|
|
|
|
const MCExpr *PB =
|
|
|
|
MCSymbolRefExpr::Create(MF->getPICBaseSymbol(),
|
|
|
|
MCSymbolRefExpr::VK_None,
|
|
|
|
OutContext);
|
|
|
|
const MCOperand MO = TmpInst.getOperand(1);
|
|
|
|
TmpInst.getOperand(1) = MCOperand::CreateExpr(MCBinaryExpr::CreateSub(Exp,
|
|
|
|
PB,
|
|
|
|
OutContext));
|
|
|
|
TmpInst.addOperand(MO);
|
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::UpdateGBR: {
|
|
|
|
// Update the GOT Base Register to point to the GOT. It may be possible to
|
|
|
|
// merge this with the PPC::GetGBRO, doing it all in one step.
|
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
|
|
|
TmpInst.setOpcode(PPC::ADD4);
|
|
|
|
TmpInst.addOperand(TmpInst.getOperand(0));
|
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::LWZtoc: {
|
|
|
|
// Transform %X3 = LWZtoc <ga:@min1>, %X2
|
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
|
|
|
|
|
|
|
// Change the opcode to LWZ, and the global address operand to be a
|
|
|
|
// reference to the GOT entry we will synthesize later.
|
|
|
|
TmpInst.setOpcode(PPC::LWZ);
|
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
|
|
|
|
|
|
|
// Map symbol -> label of TOC entry
|
|
|
|
assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
|
|
|
|
MCSymbol *MOSymbol = nullptr;
|
|
|
|
if (MO.isGlobal())
|
|
|
|
MOSymbol = getSymbol(MO.getGlobal());
|
|
|
|
else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
else if (MO.isJTI())
|
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
|
|
|
|
|
|
|
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
|
|
|
MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None,
|
|
|
|
OutContext);
|
|
|
|
const MCExpr *PB =
|
|
|
|
MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")),
|
|
|
|
OutContext);
|
|
|
|
Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext);
|
|
|
|
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
|
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
|
|
|
return;
|
|
|
|
}
|
2012-08-25 00:26:02 +08:00
|
|
|
case PPC::LDtocJTI:
|
|
|
|
case PPC::LDtocCPT:
|
2010-11-15 11:39:06 +08:00
|
|
|
case PPC::LDtoc: {
|
|
|
|
// Transform %X3 = LDtoc <ga:@min1>, %X2
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
2012-08-25 00:26:02 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
// Change the opcode to LD, and the global address operand to be a
|
|
|
|
// reference to the TOC entry we will synthesize later.
|
|
|
|
TmpInst.setOpcode(PPC::LD);
|
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
2012-08-25 00:26:02 +08:00
|
|
|
|
|
|
|
// Map symbol -> label of TOC entry
|
|
|
|
assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
2012-08-25 00:26:02 +08:00
|
|
|
if (MO.isGlobal())
|
2013-10-30 01:07:16 +08:00
|
|
|
MOSymbol = getSymbol(MO.getGlobal());
|
2012-08-25 00:26:02 +08:00
|
|
|
else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
else if (MO.isJTI())
|
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
|
2012-09-19 01:10:37 +08:00
|
|
|
|
2010-11-15 11:39:06 +08:00
|
|
|
const MCExpr *Exp =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
|
2010-11-15 11:39:06 +08:00
|
|
|
OutContext);
|
|
|
|
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
2010-11-15 03:53:02 +08:00
|
|
|
return;
|
|
|
|
}
|
2010-11-15 11:39:06 +08:00
|
|
|
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
case PPC::ADDIStocHA: {
|
|
|
|
// Transform %Xd = ADDIStocHA %X2, <ga:@sym>
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2014-06-17 05:36:02 +08:00
|
|
|
// Change the opcode to ADDIS8. If the global address is external, has
|
|
|
|
// common linkage, is a non-local function address, or is a jump table
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// address, then generate a TOC entry and reference that. Otherwise
|
|
|
|
// reference the symbol directly.
|
|
|
|
TmpInst.setOpcode(PPC::ADDIS8);
|
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) &&
|
|
|
|
"Invalid operand for ADDIStocHA!");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
bool IsExternal = false;
|
2014-06-17 05:36:02 +08:00
|
|
|
bool IsNonLocalFunction = false;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
bool IsCommon = false;
|
2013-01-08 03:29:18 +08:00
|
|
|
bool IsAvailExt = false;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isGlobal()) {
|
2014-05-29 23:41:38 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
|
|
|
MOSymbol = getSymbol(GV);
|
|
|
|
IsExternal = GV->isDeclaration();
|
|
|
|
IsCommon = GV->hasCommonLinkage();
|
2014-06-17 05:36:02 +08:00
|
|
|
IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
|
|
|
|
(GV->isDeclaration() || GV->isWeakForLinker());
|
2014-05-29 23:41:38 +08:00
|
|
|
IsAvailExt = GV->hasAvailableExternallyLinkage();
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
} else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
else if (MO.isJTI())
|
|
|
|
MOSymbol = GetJTISymbol(MO.getIndex());
|
|
|
|
|
2014-06-17 05:36:02 +08:00
|
|
|
if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt ||
|
|
|
|
MO.isJTI() || TM.getCodeModel() == CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
|
|
|
TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::LDtocL: {
|
|
|
|
// Transform %Xd = LDtocL <ga:@sym>, %Xs
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2013-03-26 18:55:45 +08:00
|
|
|
// Change the opcode to LD. If the global address is external, has
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// common linkage, or is a jump table address, then reference the
|
|
|
|
// associated TOC entry. Otherwise reference the symbol directly.
|
2013-03-26 18:55:45 +08:00
|
|
|
TmpInst.setOpcode(PPC::LD);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
2013-02-22 01:12:27 +08:00
|
|
|
assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
|
|
|
|
"Invalid operand for LDtocL!");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isJTI())
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
|
2013-09-18 04:03:25 +08:00
|
|
|
else if (MO.isCPI()) {
|
2013-02-22 01:12:27 +08:00
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
2013-09-18 04:03:25 +08:00
|
|
|
if (TM.getCodeModel() == CodeModel::Large)
|
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
}
|
2013-02-22 01:12:27 +08:00
|
|
|
else if (MO.isGlobal()) {
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2014-05-29 23:41:38 +08:00
|
|
|
MOSymbol = getSymbol(GValue);
|
|
|
|
if (GValue->getType()->getElementType()->isFunctionTy() ||
|
|
|
|
GValue->isDeclaration() || GValue->hasCommonLinkage() ||
|
|
|
|
GValue->hasAvailableExternallyLinkage() ||
|
2013-09-18 04:03:25 +08:00
|
|
|
TM.getCodeModel() == CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
}
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
|
|
|
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case PPC::ADDItocL: {
|
|
|
|
// Transform %Xd = ADDItocL %Xs, <ga:@sym>
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2013-03-26 18:55:20 +08:00
|
|
|
// Change the opcode to ADDI8. If the global address is external, then
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// generate a TOC entry and reference that. Otherwise reference the
|
|
|
|
// symbol directly.
|
2013-03-26 18:55:20 +08:00
|
|
|
TmpInst.setOpcode(PPC::ADDI8);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
|
2014-04-25 13:30:21 +08:00
|
|
|
MCSymbol *MOSymbol = nullptr;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
bool IsExternal = false;
|
2014-06-17 05:36:02 +08:00
|
|
|
bool IsNonLocalFunction = false;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (MO.isGlobal()) {
|
2014-05-29 23:41:38 +08:00
|
|
|
const GlobalValue *GV = MO.getGlobal();
|
|
|
|
MOSymbol = getSymbol(GV);
|
|
|
|
IsExternal = GV->isDeclaration();
|
2014-06-17 05:36:02 +08:00
|
|
|
IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
|
|
|
|
(GV->isDeclaration() || GV->isWeakForLinker());
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
} else if (MO.isCPI())
|
|
|
|
MOSymbol = GetCPISymbol(MO.getIndex());
|
|
|
|
|
2014-06-17 05:36:02 +08:00
|
|
|
if (IsNonLocalFunction || IsExternal ||
|
|
|
|
TM.getCodeModel() == CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
|
|
|
|
|
|
|
|
const MCExpr *Exp =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
OutContext);
|
|
|
|
TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return;
|
|
|
|
}
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
case PPC::ADDISgotTprelHA: {
|
|
|
|
// Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
|
|
|
|
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
|
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
const MCExpr *SymGotTprel =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(SymGotTprel));
|
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
case PPC::LDgotTprelL:
|
|
|
|
case PPC::LDgotTprelL32: {
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
// Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
2012-12-05 00:18:08 +08:00
|
|
|
|
2013-03-26 18:55:45 +08:00
|
|
|
// Change the opcode to LD.
|
2013-12-21 02:08:54 +08:00
|
|
|
TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
|
2012-12-05 00:18:08 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(1);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-05 00:18:08 +08:00
|
|
|
const MCExpr *Exp =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
|
2012-12-05 00:18:08 +08:00
|
|
|
OutContext);
|
|
|
|
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
2012-12-05 00:18:08 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::PPC32PICGOT: {
|
|
|
|
MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
|
|
|
|
MCSymbol *GOTRef = OutContext.CreateTempSymbol();
|
|
|
|
MCSymbol *NextInstr = OutContext.CreateTempSymbol();
|
|
|
|
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
|
|
|
|
// FIXME: We would like an efficient form for this, so we don't have to do
|
|
|
|
// a lot of extra uniquing.
|
|
|
|
.addExpr(MCSymbolRefExpr::Create(NextInstr, OutContext)));
|
|
|
|
const MCExpr *OffsExpr =
|
|
|
|
MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol, OutContext),
|
|
|
|
MCSymbolRefExpr::Create(GOTRef, OutContext),
|
|
|
|
OutContext);
|
|
|
|
OutStreamer.EmitLabel(GOTRef);
|
|
|
|
OutStreamer.EmitValue(OffsExpr, 4);
|
|
|
|
OutStreamer.EmitLabel(NextInstr);
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR)
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LWZ)
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addImm(0)
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADD4)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
return;
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
case PPC::PPC32GOT: {
|
|
|
|
MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
|
|
|
|
const MCExpr *SymGotTlsL =
|
|
|
|
MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
|
|
|
|
OutContext);
|
|
|
|
const MCExpr *SymGotTlsHA =
|
|
|
|
MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
|
|
|
|
OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI)
|
2013-12-21 02:08:54 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addExpr(SymGotTlsL));
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
|
2013-12-21 02:08:54 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addExpr(SymGotTlsHA));
|
|
|
|
return;
|
|
|
|
}
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
case PPC::ADDIStlsgdHA: {
|
|
|
|
// Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
|
|
|
|
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
|
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MCExpr *SymGotTlsGD =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(SymGotTlsGD));
|
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsgdL:
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
// Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsgdL32: {
|
|
|
|
// Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@got@tlsgd
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MCExpr *SymGotTlsGD =
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
|
|
|
|
MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO :
|
|
|
|
MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsGD));
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::GETtlsADDR:
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
// Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
|
2013-07-03 05:31:04 +08:00
|
|
|
// Into: BL8_NOP_TLS __tls_get_addr(sym@tlsgd)
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::GETtlsADDR32: {
|
|
|
|
// Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
|
|
|
|
// Into: BL_TLS __tls_get_addr(sym@tlsgd)@PLT
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
|
|
|
|
StringRef Name = "__tls_get_addr";
|
|
|
|
MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
|
|
|
|
|
|
|
|
if (!Subtarget.isPPC64() && !Subtarget.isDarwin() &&
|
|
|
|
TM.getRelocationModel() == Reloc::PIC_)
|
|
|
|
Kind = MCSymbolRefExpr::VK_PLT;
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MCSymbolRefExpr *TlsRef =
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
const MCExpr *SymVar =
|
[PowerPC] Revert r185476 and fix up TLS variant kinds
In the commit message to r185476 I wrote:
>The PowerPC-specific modifiers VK_PPC_TLSGD and VK_PPC_TLSLD
>correspond exactly to the generic modifiers VK_TLSGD and VK_TLSLD.
>This causes some confusion with the asm parser, since VK_PPC_TLSGD
>is output as @tlsgd, which is then read back in as VK_TLSGD.
>
>To avoid this confusion, this patch removes the PowerPC-specific
>modifiers and uses the generic modifiers throughout. (The only
>drawback is that the generic modifiers are printed in upper case
>while the usual convention on PowerPC is to use lower-case modifiers.
>But this is just a cosmetic issue.)
This was unfortunately incorrect, there is is fact another,
serious drawback to using the default VK_TLSLD/VK_TLSGD
variant kinds: using these causes ELFObjectWriter::RelocNeedsGOT
to return true, which in turn causes the ELFObjectWriter to emit
an undefined reference to _GLOBAL_OFFSET_TABLE_.
This is a problem on powerpc64, because it uses the TOC instead
of the GOT, and the linker does not provide _GLOBAL_OFFSET_TABLE_,
so the symbol remains undefined. This means shared libraries
using TLS built with the integrated assembler are currently
broken.
While the whole RelocNeedsGOT / _GLOBAL_OFFSET_TABLE_ situation
probably ought to be properly fixed at some point, for now I'm
simply reverting the r185476 commit. Now this in turn exposes
the breakage of handling @tlsgd/@tlsld in the asm parser that
this check-in was originally intended to fix.
To avoid this regression, I'm also adding a different fix for
this problem: while common code now parses @tlsgd as VK_TLSGD,
a special hack in the asm parser translates this code to the
platform-specific VK_PPC_TLSGD that the back-end now expects.
While this is not really pretty, it's self-contained and
shouldn't hurt anything else for now. One the underlying
problem is fixed, this hack can be reverted again.
llvm-svn: 185945
2013-07-10 00:41:09 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
|
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ?
|
|
|
|
PPC::BL8_NOP_TLS : PPC::BL_TLS)
|
|
|
|
.addExpr(TlsRef)
|
|
|
|
.addExpr(SymVar));
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
return;
|
|
|
|
}
|
2012-12-13 03:29:35 +08:00
|
|
|
case PPC::ADDIStlsldHA: {
|
|
|
|
// Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha
|
|
|
|
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
|
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymGotTlsLD =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
|
2012-12-13 03:29:35 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(SymGotTlsLD));
|
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsldL:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDItlsldL32: {
|
|
|
|
// Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@got@tlsld
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymGotTlsLD =
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
|
|
|
|
MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO :
|
|
|
|
MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymGotTlsLD));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::GETtlsldADDR:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
|
2013-07-03 05:31:04 +08:00
|
|
|
// Into: BL8_NOP_TLS __tls_get_addr(sym@tlsld)
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::GETtlsldADDR32: {
|
|
|
|
// Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
|
|
|
|
// Into: BL_TLS __tls_get_addr(sym@tlsld)@PLT
|
2012-12-13 03:29:35 +08:00
|
|
|
|
|
|
|
StringRef Name = "__tls_get_addr";
|
|
|
|
MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
|
|
|
|
|
|
|
|
if (!Subtarget.isPPC64() && !Subtarget.isDarwin() &&
|
|
|
|
TM.getRelocationModel() == Reloc::PIC_)
|
|
|
|
Kind = MCSymbolRefExpr::VK_PLT;
|
|
|
|
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCSymbolRefExpr *TlsRef =
|
2014-07-26 01:47:22 +08:00
|
|
|
MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymVar =
|
[PowerPC] Revert r185476 and fix up TLS variant kinds
In the commit message to r185476 I wrote:
>The PowerPC-specific modifiers VK_PPC_TLSGD and VK_PPC_TLSLD
>correspond exactly to the generic modifiers VK_TLSGD and VK_TLSLD.
>This causes some confusion with the asm parser, since VK_PPC_TLSGD
>is output as @tlsgd, which is then read back in as VK_TLSGD.
>
>To avoid this confusion, this patch removes the PowerPC-specific
>modifiers and uses the generic modifiers throughout. (The only
>drawback is that the generic modifiers are printed in upper case
>while the usual convention on PowerPC is to use lower-case modifiers.
>But this is just a cosmetic issue.)
This was unfortunately incorrect, there is is fact another,
serious drawback to using the default VK_TLSLD/VK_TLSGD
variant kinds: using these causes ELFObjectWriter::RelocNeedsGOT
to return true, which in turn causes the ELFObjectWriter to emit
an undefined reference to _GLOBAL_OFFSET_TABLE_.
This is a problem on powerpc64, because it uses the TOC instead
of the GOT, and the linker does not provide _GLOBAL_OFFSET_TABLE_,
so the symbol remains undefined. This means shared libraries
using TLS built with the integrated assembler are currently
broken.
While the whole RelocNeedsGOT / _GLOBAL_OFFSET_TABLE_ situation
probably ought to be properly fixed at some point, for now I'm
simply reverting the r185476 commit. Now this in turn exposes
the breakage of handling @tlsgd/@tlsld in the asm parser that
this check-in was originally intended to fix.
To avoid this regression, I'm also adding a different fix for
this problem: while common code now parses @tlsgd as VK_TLSGD,
a special hack in the asm parser translates this code to the
platform-specific VK_PPC_TLSGD that the back-end now expects.
While this is not really pretty, it's self-contained and
shouldn't hurt anything else for now. One the underlying
problem is fixed, this hack can be reverted again.
llvm-svn: 185945
2013-07-10 00:41:09 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
|
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ?
|
|
|
|
PPC::BL8_NOP_TLS : PPC::BL_TLS)
|
|
|
|
.addExpr(TlsRef)
|
|
|
|
.addExpr(SymVar));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDISdtprelHA:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym>
|
|
|
|
// Into: %Xd = ADDIS8 %X3, sym@dtprel@ha
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDISdtprelHA32: {
|
|
|
|
// Transform: %Rd = ADDISdtprelHA32 %R3, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDIS %R3, sym@dtprel@ha
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymDtprel =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(Subtarget.isPPC64() ? PPC::X3 : PPC::R3)
|
|
|
|
.addExpr(SymDtprel));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDIdtprelL:
|
2012-12-13 03:29:35 +08:00
|
|
|
// Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
|
2013-03-26 18:55:20 +08:00
|
|
|
// Into: %Xd = ADDI8 %Xs, sym@dtprel@l
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPC::ADDIdtprelL32: {
|
|
|
|
// Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
|
|
|
|
// Into: %Rd = ADDI %Rs, sym@dtprel@l
|
2012-12-13 03:29:35 +08:00
|
|
|
const MachineOperand &MO = MI->getOperand(2);
|
|
|
|
const GlobalValue *GValue = MO.getGlobal();
|
2013-10-30 01:07:16 +08:00
|
|
|
MCSymbol *MOSymbol = getSymbol(GValue);
|
2012-12-13 03:29:35 +08:00
|
|
|
const MCExpr *SymDtprel =
|
2013-06-21 22:42:20 +08:00
|
|
|
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
|
2012-12-13 03:29:35 +08:00
|
|
|
OutContext);
|
2014-07-26 01:47:22 +08:00
|
|
|
EmitToStreamer(OutStreamer,
|
|
|
|
MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
|
|
|
|
.addReg(MI->getOperand(0).getReg())
|
|
|
|
.addReg(MI->getOperand(1).getReg())
|
|
|
|
.addExpr(SymDtprel));
|
2012-12-13 03:29:35 +08:00
|
|
|
return;
|
|
|
|
}
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
case PPC::MFOCRF:
|
|
|
|
case PPC::MFOCRF8:
|
|
|
|
if (!Subtarget.hasMFOCRF()) {
|
|
|
|
// Transform: %R3 = MFOCRF %CR7
|
|
|
|
// Into: %R3 = MFCR ;; cr7
|
|
|
|
unsigned NewOpcode =
|
|
|
|
MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
|
|
|
|
OutStreamer.AddComment(PPCInstPrinter::
|
|
|
|
getRegisterName(MI->getOperand(1).getReg()));
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
.addReg(MI->getOperand(0).getReg()));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
2013-07-04 01:59:07 +08:00
|
|
|
case PPC::MTOCRF:
|
|
|
|
case PPC::MTOCRF8:
|
|
|
|
if (!Subtarget.hasMFOCRF()) {
|
|
|
|
// Transform: %CR7 = MTOCRF %R3
|
|
|
|
// Into: MTCRF mask, %R3 ;; cr7
|
|
|
|
unsigned NewOpcode =
|
|
|
|
MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
|
|
|
|
unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
|
|
|
|
->getEncodingValue(MI->getOperand(0).getReg());
|
|
|
|
OutStreamer.AddComment(PPCInstPrinter::
|
|
|
|
getRegisterName(MI->getOperand(0).getReg()));
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
|
2013-07-04 01:59:07 +08:00
|
|
|
.addImm(Mask)
|
|
|
|
.addReg(MI->getOperand(1).getReg()));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
case PPC::LD:
|
|
|
|
case PPC::STD:
|
2013-08-30 23:18:11 +08:00
|
|
|
case PPC::LWA_32:
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
case PPC::LWA: {
|
|
|
|
// Verify alignment is legal, so we don't create relocations
|
|
|
|
// that can't be supported.
|
|
|
|
// FIXME: This test is currently disabled for Darwin. The test
|
|
|
|
// suite shows a handful of test cases that fail this check for
|
|
|
|
// Darwin. Those need to be investigated before this sanity test
|
|
|
|
// can be enabled for those subtargets.
|
|
|
|
if (!Subtarget.isDarwin()) {
|
|
|
|
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
|
|
|
|
const MachineOperand &MO = MI->getOperand(OpNum);
|
|
|
|
if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
|
|
|
|
llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
|
|
|
|
}
|
|
|
|
// Now process the instruction normally.
|
|
|
|
break;
|
|
|
|
}
|
2010-01-28 09:28:58 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, TmpInst);
|
2004-09-04 13:00:00 +08:00
|
|
|
}
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
if (Subtarget.isELFv2ABI()) {
|
|
|
|
PPCTargetStreamer *TS =
|
|
|
|
static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
|
|
|
|
|
|
|
|
if (TS)
|
|
|
|
TS->emitAbiVersion(2);
|
|
|
|
}
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_)
|
|
|
|
return AsmPrinter::EmitStartOfAsmFile(M);
|
|
|
|
|
|
|
|
// FIXME: The use of .got2 assumes large GOT model (-fPIC), which is not
|
|
|
|
// optimal for some cases. We should consider supporting small model (-fpic)
|
|
|
|
// as well in the future.
|
|
|
|
assert(TM.getCodeModel() != CodeModel::Small &&
|
|
|
|
"Small code model PIC is currently unsupported.");
|
|
|
|
OutStreamer.SwitchSection(OutContext.getELFSection(".got2",
|
|
|
|
ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
|
|
|
|
SectionKind::getReadOnly()));
|
|
|
|
|
|
|
|
MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".L.TOC."));
|
|
|
|
MCSymbol *CurrentPos = OutContext.CreateTempSymbol();
|
|
|
|
|
|
|
|
OutStreamer.EmitLabel(CurrentPos);
|
|
|
|
|
|
|
|
// The GOT pointer points to the middle of the GOT, in order to reference the
|
|
|
|
// entire 64kB range. 0x8000 is the midpoint.
|
|
|
|
const MCExpr *tocExpr =
|
|
|
|
MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext),
|
|
|
|
MCConstantExpr::Create(0x8000, OutContext),
|
|
|
|
OutContext);
|
|
|
|
|
|
|
|
OutStreamer.EmitAssignment(TOCSym, tocExpr);
|
|
|
|
|
|
|
|
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
|
|
|
|
}
|
|
|
|
|
2010-01-27 15:21:55 +08:00
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
|
2014-07-19 07:29:49 +08:00
|
|
|
// linux/ppc32 - Normal entry label.
|
|
|
|
if (!Subtarget.isPPC64() && TM.getRelocationModel() != Reloc::PIC_)
|
2010-01-27 15:21:55 +08:00
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
2014-07-19 07:29:49 +08:00
|
|
|
|
|
|
|
if (!Subtarget.isPPC64()) {
|
|
|
|
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
|
|
|
|
if (PPCFI->usesPICBase()) {
|
|
|
|
MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
|
|
|
|
MCSymbol *PICBase = MF->getPICBaseSymbol();
|
|
|
|
OutStreamer.EmitLabel(RelocSymbol);
|
|
|
|
|
|
|
|
const MCExpr *OffsExpr =
|
|
|
|
MCBinaryExpr::CreateSub(
|
|
|
|
MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")),
|
|
|
|
OutContext),
|
|
|
|
MCSymbolRefExpr::Create(PICBase, OutContext),
|
|
|
|
OutContext);
|
|
|
|
OutStreamer.EmitValue(OffsExpr, 4);
|
|
|
|
OutStreamer.EmitLabel(CurrentFnSym);
|
|
|
|
return;
|
|
|
|
} else
|
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
|
|
|
}
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
|
|
|
|
// ELFv2 ABI - Normal entry label.
|
|
|
|
if (Subtarget.isELFv2ABI())
|
|
|
|
return AsmPrinter::EmitFunctionEntryLabel();
|
|
|
|
|
2010-01-27 15:21:55 +08:00
|
|
|
// Emit an official procedure descriptor.
|
2013-04-18 05:18:16 +08:00
|
|
|
MCSectionSubPair Current = OutStreamer.getCurrentSection();
|
2012-02-28 04:20:47 +08:00
|
|
|
const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd",
|
|
|
|
ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
|
|
|
|
SectionKind::getReadOnly());
|
|
|
|
OutStreamer.SwitchSection(Section);
|
2010-01-27 15:21:55 +08:00
|
|
|
OutStreamer.EmitLabel(CurrentFnSym);
|
2012-02-28 04:20:47 +08:00
|
|
|
OutStreamer.EmitValueToAlignment(8);
|
|
|
|
MCSymbol *Symbol1 =
|
|
|
|
OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
|
2012-10-25 20:27:42 +08:00
|
|
|
// Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
|
|
|
|
// entry point.
|
2012-02-28 04:20:47 +08:00
|
|
|
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
|
2013-01-09 09:57:54 +08:00
|
|
|
8 /*size*/);
|
2012-10-25 20:27:42 +08:00
|
|
|
MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
|
|
|
|
// Generates a R_PPC64_TOC relocation for TOC base insertion.
|
|
|
|
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
|
2013-06-21 06:39:42 +08:00
|
|
|
MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
|
2013-01-09 09:57:54 +08:00
|
|
|
8/*size*/);
|
2012-09-19 00:55:29 +08:00
|
|
|
// Emit a null environment pointer.
|
2013-01-09 09:57:54 +08:00
|
|
|
OutStreamer.EmitIntValue(0, 8 /* size */);
|
2013-04-18 05:18:16 +08:00
|
|
|
OutStreamer.SwitchSection(Current.first, Current.second);
|
2012-02-23 05:11:47 +08:00
|
|
|
|
|
|
|
MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
|
|
|
|
".L." + Twine(CurrentFnSym->getName()));
|
|
|
|
OutStreamer.EmitLabel(RealFnSym);
|
|
|
|
CurrentFnSymForSize = RealFnSym;
|
2010-01-27 15:21:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-08-15 19:54:46 +08:00
|
|
|
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
|
2012-10-09 00:38:25 +08:00
|
|
|
const DataLayout *TD = TM.getDataLayout();
|
2009-08-15 19:54:46 +08:00
|
|
|
|
Revert the majority of the next patch in the address space series:
r165941: Resubmit the changes to llvm core to update the functions to
support different pointer sizes on a per address space basis.
Despite this commit log, this change primarily changed stuff outside of
VMCore, and those changes do not carry any tests for correctness (or
even plausibility), and we have consistently found questionable or flat
out incorrect cases in these changes. Most of them are probably correct,
but we need to devise a system that makes it more clear when we have
handled the address space concerns correctly, and ideally each pass that
gets updated would receive an accompanying test case that exercises that
pass specificaly w.r.t. alternate address spaces.
However, from this commit, I have retained the new C API entry points.
Those were an orthogonal change that probably should have been split
apart, but they seem entirely good.
In several places the changes were very obvious cleanups with no actual
multiple address space code added; these I have not reverted when
I spotted them.
In a few other places there were merge conflicts due to a cleaner
solution being implemented later, often not using address spaces at all.
In those cases, I've preserved the new code which isn't address space
dependent.
This is part of my ongoing effort to clean out the partial address space
code which carries high risk and low test coverage, and not likely to be
finished before the 3.2 release looms closer. Duncan and I would both
like to see the above issues addressed before we return to these
changes.
llvm-svn: 167222
2012-11-01 17:14:31 +08:00
|
|
|
bool isPPC64 = TD->getPointerSizeInBits() == 64;
|
2009-08-15 19:54:46 +08:00
|
|
|
|
2013-10-08 21:08:17 +08:00
|
|
|
PPCTargetStreamer &TS =
|
2014-01-14 09:21:46 +08:00
|
|
|
static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer());
|
2013-10-08 21:08:17 +08:00
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
if (!TOC.empty()) {
|
|
|
|
const MCSectionELF *Section;
|
|
|
|
|
|
|
|
if (isPPC64)
|
|
|
|
Section = OutStreamer.getContext().getELFSection(".toc",
|
|
|
|
ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
|
|
|
|
SectionKind::getReadOnly());
|
|
|
|
else
|
|
|
|
Section = OutStreamer.getContext().getELFSection(".got2",
|
2012-02-29 02:15:25 +08:00
|
|
|
ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
|
|
|
|
SectionKind::getReadOnly());
|
|
|
|
OutStreamer.SwitchSection(Section);
|
2009-08-15 19:54:46 +08:00
|
|
|
|
2012-11-13 03:13:24 +08:00
|
|
|
for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
|
2010-01-16 10:09:06 +08:00
|
|
|
E = TOC.end(); I != E; ++I) {
|
2010-04-04 15:05:53 +08:00
|
|
|
OutStreamer.EmitLabel(I->second);
|
2012-10-15 23:43:14 +08:00
|
|
|
MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
|
2014-07-19 07:29:49 +08:00
|
|
|
if (isPPC64)
|
|
|
|
TS.emitTCEntry(*S);
|
|
|
|
else
|
|
|
|
OutStreamer.EmitSymbolValue(S, 4);
|
2009-08-15 19:54:46 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-01-10 01:08:15 +08:00
|
|
|
MachineModuleInfoELF &MMIELF =
|
|
|
|
MMI->getObjFileInfo<MachineModuleInfoELF>();
|
|
|
|
|
|
|
|
MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
|
|
|
|
if (!Stubs.empty()) {
|
|
|
|
OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
|
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
|
|
|
// L_foo$stub:
|
|
|
|
OutStreamer.EmitLabel(Stubs[i].first);
|
|
|
|
// .long _foo
|
|
|
|
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(),
|
|
|
|
OutContext),
|
2013-07-02 23:49:13 +08:00
|
|
|
isPPC64 ? 8 : 4/*size*/);
|
2013-01-10 01:08:15 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Stubs.clear();
|
|
|
|
OutStreamer.AddBlankLine();
|
|
|
|
}
|
|
|
|
|
2009-08-15 19:54:46 +08:00
|
|
|
return AsmPrinter::doFinalization(M);
|
|
|
|
}
|
2006-12-22 04:26:09 +08:00
|
|
|
|
[PowerPC] ELFv2 function call changes
This patch builds upon the two preceding MC changes to implement the
basic ELFv2 function call convention. In the ELFv1 ABI, a "function
descriptor" was associated with every function, pointing to both the
entry address and the related TOC base (and a static chain pointer
for nested functions). Function pointers would actually refer to that
descriptor, and the indirect call sequence needed to load up both entry
address and TOC base.
In the ELFv2 ABI, there are no more function descriptors, and function
pointers simply refer to the (global) entry point of the function code.
Indirect function calls simply branch to that address, after loading it
up into r12 (as required by the ABI rules for a global entry point).
Direct function calls continue to just do a "bl" to the target symbol;
this will be resolved by the linker to the local entry point of the
target function if it is local, and to a PLT stub if it is global.
That PLT stub would then load the (global) entry point address of the
final target into r12 and branch to it. Note that when performing a
local function call, r2 must be set up to point to the current TOC
base: if the target ends up local, the ABI requires that its local
entry point is called with r2 set up; if the target ends up global,
the PLT stub requires that r2 is set up.
This patch implements all LLVM changes to implement that scheme:
- No longer create a function descriptor when emitting a function
definition (in EmitFunctionEntryLabel)
- Emit two entry points *if* the function needs the TOC base (r2)
anywhere (this is done EmitFunctionBodyStart; note that this cannot
be done in EmitFunctionBodyStart because the global entry point
prologue code must be *part* of the function as covered by debug info).
- In order to make use tracking of r2 (as needed above) work correctly,
mark direct function calls as implicitly using r2.
- Implement the ELFv2 indirect function call sequence (no function
descriptors; load target address into r12).
- When creating an ELFv2 object file, emit the .abiversion 2 directive
to tell the linker to create the appropriate version of PLT stubs.
Reviewed by Hal Finkel.
llvm-svn: 213489
2014-07-21 07:31:44 +08:00
|
|
|
/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
|
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
|
|
|
|
// In the ELFv2 ABI, in functions that use the TOC register, we need to
|
|
|
|
// provide two entry points. The ABI guarantees that when calling the
|
|
|
|
// local entry point, r2 is set up by the caller to contain the TOC base
|
|
|
|
// for this function, and when calling the global entry point, r12 is set
|
|
|
|
// up by the caller to hold the address of the global entry point. We
|
|
|
|
// thus emit a prefix sequence along the following lines:
|
|
|
|
//
|
|
|
|
// func:
|
|
|
|
// # global entry point
|
|
|
|
// addis r2,r12,(.TOC.-func)@ha
|
|
|
|
// addi r2,r2,(.TOC.-func)@l
|
|
|
|
// .localentry func, .-func
|
|
|
|
// # local entry point, followed by function body
|
|
|
|
//
|
|
|
|
// This ensures we have r2 set up correctly while executing the function
|
|
|
|
// body, no matter which entry point is called.
|
|
|
|
if (Subtarget.isELFv2ABI()
|
|
|
|
// Only do all that if the function uses r2 in the first place.
|
|
|
|
&& !MF->getRegInfo().use_empty(PPC::X2)) {
|
|
|
|
|
|
|
|
MCSymbol *GlobalEntryLabel = OutContext.CreateTempSymbol();
|
|
|
|
OutStreamer.EmitLabel(GlobalEntryLabel);
|
|
|
|
const MCSymbolRefExpr *GlobalEntryLabelExp =
|
|
|
|
MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext);
|
|
|
|
|
|
|
|
MCSymbol *TOCSymbol = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
|
|
|
|
const MCExpr *TOCDeltaExpr =
|
|
|
|
MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext),
|
|
|
|
GlobalEntryLabelExp, OutContext);
|
|
|
|
|
|
|
|
const MCExpr *TOCDeltaHi =
|
|
|
|
PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext);
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X12)
|
|
|
|
.addExpr(TOCDeltaHi));
|
|
|
|
|
|
|
|
const MCExpr *TOCDeltaLo =
|
|
|
|
PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext);
|
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addReg(PPC::X2)
|
|
|
|
.addExpr(TOCDeltaLo));
|
|
|
|
|
|
|
|
MCSymbol *LocalEntryLabel = OutContext.CreateTempSymbol();
|
|
|
|
OutStreamer.EmitLabel(LocalEntryLabel);
|
|
|
|
const MCSymbolRefExpr *LocalEntryLabelExp =
|
|
|
|
MCSymbolRefExpr::Create(LocalEntryLabel, OutContext);
|
|
|
|
const MCExpr *LocalOffsetExp =
|
|
|
|
MCBinaryExpr::CreateSub(LocalEntryLabelExp,
|
|
|
|
GlobalEntryLabelExp, OutContext);
|
|
|
|
|
|
|
|
PPCTargetStreamer *TS =
|
|
|
|
static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
|
|
|
|
|
|
|
|
if (TS)
|
|
|
|
TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-08-29 03:06:55 +08:00
|
|
|
/// EmitFunctionBodyEnd - Print the traceback table before the .size
|
|
|
|
/// directive.
|
|
|
|
///
|
|
|
|
void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
|
|
|
|
// Only the 64-bit target requires a traceback table. For now,
|
|
|
|
// we only emit the word of zeroes that GDB requires to find
|
2012-08-30 04:22:24 +08:00
|
|
|
// the end of the function, and zeroes for the eight-byte
|
|
|
|
// mandatory fields.
|
|
|
|
// FIXME: We should fill in the eight-byte mandatory fields as described in
|
|
|
|
// the PPC64 ELF ABI (this is a low-priority item because GDB does not
|
|
|
|
// currently make use of these fields).
|
|
|
|
if (Subtarget.isPPC64()) {
|
2012-08-29 03:06:55 +08:00
|
|
|
OutStreamer.EmitIntValue(0, 4/*size*/);
|
2012-08-30 04:22:24 +08:00
|
|
|
OutStreamer.EmitIntValue(0, 8/*size*/);
|
|
|
|
}
|
2012-08-29 03:06:55 +08:00
|
|
|
}
|
|
|
|
|
2009-10-01 06:06:26 +08:00
|
|
|
void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
|
2008-03-26 05:45:14 +08:00
|
|
|
static const char *const CPUDirectives[] = {
|
2008-02-15 07:35:16 +08:00
|
|
|
"",
|
2006-12-13 04:57:08 +08:00
|
|
|
"ppc",
|
2011-10-17 12:03:49 +08:00
|
|
|
"ppc440",
|
2006-12-13 04:57:08 +08:00
|
|
|
"ppc601",
|
|
|
|
"ppc602",
|
|
|
|
"ppc603",
|
|
|
|
"ppc7400",
|
|
|
|
"ppc750",
|
|
|
|
"ppc970",
|
2012-04-02 03:22:40 +08:00
|
|
|
"ppcA2",
|
2012-08-29 00:12:39 +08:00
|
|
|
"ppce500mc",
|
|
|
|
"ppce5500",
|
2013-02-04 08:47:33 +08:00
|
|
|
"power3",
|
|
|
|
"power4",
|
|
|
|
"power5",
|
|
|
|
"power5x",
|
2012-06-11 23:43:08 +08:00
|
|
|
"power6",
|
2013-02-04 08:47:33 +08:00
|
|
|
"power6x",
|
2012-06-11 23:43:08 +08:00
|
|
|
"power7",
|
2013-07-26 09:35:43 +08:00
|
|
|
"ppc64",
|
|
|
|
"ppc64le"
|
2006-12-13 04:57:08 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
unsigned Directive = Subtarget.getDarwinDirective();
|
2012-06-12 03:57:01 +08:00
|
|
|
if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970)
|
2006-12-13 04:57:08 +08:00
|
|
|
Directive = PPC::DIR_970;
|
|
|
|
if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
|
|
|
|
Directive = PPC::DIR_7400;
|
2011-12-01 09:43:47 +08:00
|
|
|
if (Subtarget.isPPC64() && Directive < PPC::DIR_64)
|
2006-12-13 04:57:08 +08:00
|
|
|
Directive = PPC::DIR_64;
|
|
|
|
assert(Directive <= PPC::DIR_64 && "Directive out of range.");
|
2014-01-25 10:35:56 +08:00
|
|
|
|
|
|
|
assert(Directive < array_lengthof(CPUDirectives) &&
|
|
|
|
"CPUDirectives[] might not be up-to-date!");
|
|
|
|
PPCTargetStreamer &TStreamer =
|
|
|
|
*static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
|
|
|
|
TStreamer.emitMachine(CPUDirectives[Directive]);
|
2008-08-09 02:22:59 +08:00
|
|
|
|
2006-11-29 02:21:52 +08:00
|
|
|
// Prime text sections so they are adjacent. This reduces the likelihood a
|
|
|
|
// large data or debug section causes a branch to exceed 16M limit.
|
2010-04-18 00:44:48 +08:00
|
|
|
const TargetLoweringObjectFileMachO &TLOFMacho =
|
|
|
|
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
|
2006-11-29 02:21:52 +08:00
|
|
|
if (TM.getRelocationModel() == Reloc::PIC_) {
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2009-08-19 13:49:37 +08:00
|
|
|
32, SectionKind::getText()));
|
2006-11-29 02:21:52 +08:00
|
|
|
} else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT","__symbol_stub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2009-08-19 13:49:37 +08:00
|
|
|
16, SectionKind::getText()));
|
2006-11-29 02:21:52 +08:00
|
|
|
}
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
|
2005-07-21 09:25:49 +08:00
|
|
|
}
|
|
|
|
|
2010-04-04 15:12:28 +08:00
|
|
|
static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) {
|
2010-01-21 05:36:48 +08:00
|
|
|
// Remove $stub suffix, add $lazy_ptr.
|
2012-11-24 21:18:11 +08:00
|
|
|
StringRef NoStub = Sym->getName().substr(0, Sym->getName().size()-5);
|
|
|
|
return Ctx.GetOrCreateSymbol(NoStub + "$lazy_ptr");
|
2010-01-21 05:36:48 +08:00
|
|
|
}
|
|
|
|
|
2010-04-04 15:12:28 +08:00
|
|
|
static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
|
2010-01-21 05:36:48 +08:00
|
|
|
// Add $tmp suffix to $stub, yielding $stub$tmp.
|
2012-11-24 21:18:11 +08:00
|
|
|
return Ctx.GetOrCreateSymbol(Sym->getName() + "$tmp");
|
2010-01-21 05:36:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void PPCDarwinAsmPrinter::
|
|
|
|
EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
|
Revert the majority of the next patch in the address space series:
r165941: Resubmit the changes to llvm core to update the functions to
support different pointer sizes on a per address space basis.
Despite this commit log, this change primarily changed stuff outside of
VMCore, and those changes do not carry any tests for correctness (or
even plausibility), and we have consistently found questionable or flat
out incorrect cases in these changes. Most of them are probably correct,
but we need to devise a system that makes it more clear when we have
handled the address space concerns correctly, and ideally each pass that
gets updated would receive an accompanying test case that exercises that
pass specificaly w.r.t. alternate address spaces.
However, from this commit, I have retained the new C API entry points.
Those were an orthogonal change that probably should have been split
apart, but they seem entirely good.
In several places the changes were very obvious cleanups with no actual
multiple address space code added; these I have not reverted when
I spotted them.
In a few other places there were merge conflicts due to a cleaner
solution being implemented later, often not using address spaces at all.
In those cases, I've preserved the new code which isn't address space
dependent.
This is part of my ongoing effort to clean out the partial address space
code which carries high risk and low test coverage, and not likely to be
finished before the 3.2 release looms closer. Duncan and I would both
like to see the above issues addressed before we return to these
changes.
llvm-svn: 167222
2012-11-01 17:14:31 +08:00
|
|
|
bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
bool isDarwin = Subtarget.isDarwin();
|
2010-01-21 05:19:44 +08:00
|
|
|
|
2010-04-18 00:44:48 +08:00
|
|
|
const TargetLoweringObjectFileMachO &TLOFMacho =
|
|
|
|
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
|
2010-01-21 05:19:44 +08:00
|
|
|
|
|
|
|
// .lazy_symbol_pointer
|
|
|
|
const MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
|
2009-08-10 09:39:42 +08:00
|
|
|
|
2004-07-17 04:29:04 +08:00
|
|
|
// Output stubs for dynamically-linked functions
|
2010-01-21 05:19:44 +08:00
|
|
|
if (TM.getRelocationModel() == Reloc::PIC_) {
|
2009-08-04 06:52:21 +08:00
|
|
|
const MCSection *StubSection =
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2010-04-09 04:40:11 +08:00
|
|
|
32, SectionKind::getText());
|
2010-01-21 05:36:48 +08:00
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(StubSection);
|
2006-06-27 09:02:25 +08:00
|
|
|
EmitAlignment(4);
|
2010-01-21 05:36:48 +08:00
|
|
|
|
2010-04-04 15:05:53 +08:00
|
|
|
MCSymbol *Stub = Stubs[i].first;
|
2010-04-04 15:12:28 +08:00
|
|
|
MCSymbol *RawSym = Stubs[i].second.getPointer();
|
|
|
|
MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
|
|
|
|
MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
|
2010-01-21 05:36:48 +08:00
|
|
|
|
2010-04-04 15:05:53 +08:00
|
|
|
OutStreamer.EmitLabel(Stub);
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
|
2012-11-24 21:18:25 +08:00
|
|
|
|
2013-03-24 04:53:15 +08:00
|
|
|
const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
|
2013-05-24 06:26:41 +08:00
|
|
|
const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
|
|
|
|
const MCExpr *Sub =
|
|
|
|
MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
|
2013-03-24 04:53:15 +08:00
|
|
|
|
2012-11-24 21:18:25 +08:00
|
|
|
// mflr r0
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
|
2013-03-24 04:53:15 +08:00
|
|
|
// bcl 20, 31, AnonSymbol
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon));
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.EmitLabel(AnonSymbol);
|
2012-11-24 21:18:25 +08:00
|
|
|
// mflr r11
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
|
2012-11-24 21:18:25 +08:00
|
|
|
// addis r11, r11, ha16(LazyPtr - AnonSymbol)
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
|
2012-11-26 21:34:22 +08:00
|
|
|
.addReg(PPC::R11)
|
|
|
|
.addReg(PPC::R11)
|
2013-05-24 06:26:41 +08:00
|
|
|
.addExpr(SubHa16));
|
2012-11-24 21:18:25 +08:00
|
|
|
// mtlr r0
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
|
2012-11-26 21:34:22 +08:00
|
|
|
|
|
|
|
// ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
|
|
|
|
// lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
|
2012-11-26 21:34:22 +08:00
|
|
|
.addReg(PPC::R12)
|
2013-05-24 06:26:41 +08:00
|
|
|
.addExpr(SubLo16).addExpr(SubLo16)
|
2012-11-27 02:05:52 +08:00
|
|
|
.addReg(PPC::R11));
|
2012-11-24 21:18:25 +08:00
|
|
|
// mtctr r12
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
|
2012-11-24 21:18:25 +08:00
|
|
|
// bctr
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
|
2012-11-24 21:18:25 +08:00
|
|
|
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(LSPSection);
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.EmitLabel(LazyPtr);
|
|
|
|
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
|
2012-11-24 21:18:25 +08:00
|
|
|
|
|
|
|
MCSymbol *DyldStubBindingHelper =
|
|
|
|
OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
|
|
|
|
if (isPPC64) {
|
|
|
|
// .quad dyld_stub_binding_helper
|
|
|
|
OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
|
|
|
|
} else {
|
|
|
|
// .long dyld_stub_binding_helper
|
|
|
|
OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
|
|
|
|
}
|
2005-12-13 12:33:58 +08:00
|
|
|
}
|
2010-04-04 15:05:53 +08:00
|
|
|
OutStreamer.AddBlankLine();
|
2010-01-21 05:19:44 +08:00
|
|
|
return;
|
2004-06-25 07:04:11 +08:00
|
|
|
}
|
2010-01-21 05:19:44 +08:00
|
|
|
|
|
|
|
const MCSection *StubSection =
|
2010-04-09 04:40:11 +08:00
|
|
|
OutContext.getMachOSection("__TEXT","__symbol_stub1",
|
2014-03-07 15:36:05 +08:00
|
|
|
MachO::S_SYMBOL_STUBS |
|
|
|
|
MachO::S_ATTR_PURE_INSTRUCTIONS,
|
2010-04-09 04:40:11 +08:00
|
|
|
16, SectionKind::getText());
|
2010-01-21 05:36:48 +08:00
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
2010-04-04 15:05:53 +08:00
|
|
|
MCSymbol *Stub = Stubs[i].first;
|
2010-04-04 15:12:28 +08:00
|
|
|
MCSymbol *RawSym = Stubs[i].second.getPointer();
|
|
|
|
MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
|
2013-05-24 06:26:41 +08:00
|
|
|
const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
|
2010-01-21 05:36:48 +08:00
|
|
|
|
2010-01-21 05:19:44 +08:00
|
|
|
OutStreamer.SwitchSection(StubSection);
|
|
|
|
EmitAlignment(4);
|
2010-04-04 15:05:53 +08:00
|
|
|
OutStreamer.EmitLabel(Stub);
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
|
2013-05-24 06:26:41 +08:00
|
|
|
|
2012-11-24 21:18:25 +08:00
|
|
|
// lis r11, ha16(LazyPtr)
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
const MCExpr *LazyPtrHa16 =
|
|
|
|
PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LIS)
|
2012-11-26 21:34:22 +08:00
|
|
|
.addReg(PPC::R11)
|
2012-11-27 02:05:52 +08:00
|
|
|
.addExpr(LazyPtrHa16));
|
2012-11-26 21:34:22 +08:00
|
|
|
|
|
|
|
// ldu r12, lo16(LazyPtr)(r11)
|
|
|
|
// lwzu r12, lo16(LazyPtr)(r11)
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
const MCExpr *LazyPtrLo16 =
|
|
|
|
PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext);
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
|
2012-11-26 21:34:22 +08:00
|
|
|
.addReg(PPC::R12)
|
|
|
|
.addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
|
2012-11-27 02:05:52 +08:00
|
|
|
.addReg(PPC::R11));
|
2012-11-26 21:34:22 +08:00
|
|
|
|
2012-11-24 21:18:25 +08:00
|
|
|
// mtctr r12
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
|
2012-11-24 21:18:25 +08:00
|
|
|
// bctr
|
2014-01-29 07:12:42 +08:00
|
|
|
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
|
2012-11-26 21:34:22 +08:00
|
|
|
|
2010-01-21 05:19:44 +08:00
|
|
|
OutStreamer.SwitchSection(LSPSection);
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.EmitLabel(LazyPtr);
|
|
|
|
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
|
2012-11-24 21:18:25 +08:00
|
|
|
|
|
|
|
MCSymbol *DyldStubBindingHelper =
|
|
|
|
OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
|
|
|
|
if (isPPC64) {
|
|
|
|
// .quad dyld_stub_binding_helper
|
|
|
|
OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
|
|
|
|
} else {
|
|
|
|
// .long dyld_stub_binding_helper
|
|
|
|
OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
|
|
|
|
}
|
2010-01-21 05:19:44 +08:00
|
|
|
}
|
|
|
|
|
2010-04-04 15:12:28 +08:00
|
|
|
OutStreamer.AddBlankLine();
|
2010-01-21 05:19:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
|
Revert the majority of the next patch in the address space series:
r165941: Resubmit the changes to llvm core to update the functions to
support different pointer sizes on a per address space basis.
Despite this commit log, this change primarily changed stuff outside of
VMCore, and those changes do not carry any tests for correctness (or
even plausibility), and we have consistently found questionable or flat
out incorrect cases in these changes. Most of them are probably correct,
but we need to devise a system that makes it more clear when we have
handled the address space concerns correctly, and ideally each pass that
gets updated would receive an accompanying test case that exercises that
pass specificaly w.r.t. alternate address spaces.
However, from this commit, I have retained the new C API entry points.
Those were an orthogonal change that probably should have been split
apart, but they seem entirely good.
In several places the changes were very obvious cleanups with no actual
multiple address space code added; these I have not reverted when
I spotted them.
In a few other places there were merge conflicts due to a cleaner
solution being implemented later, often not using address spaces at all.
In those cases, I've preserved the new code which isn't address space
dependent.
This is part of my ongoing effort to clean out the partial address space
code which carries high risk and low test coverage, and not likely to be
finished before the 3.2 release looms closer. Duncan and I would both
like to see the above issues addressed before we return to these
changes.
llvm-svn: 167222
2012-11-01 17:14:31 +08:00
|
|
|
bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
|
2010-01-21 05:19:44 +08:00
|
|
|
|
|
|
|
// Darwin/PPC always uses mach-o.
|
2010-04-18 00:44:48 +08:00
|
|
|
const TargetLoweringObjectFileMachO &TLOFMacho =
|
|
|
|
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
|
2010-01-21 05:19:44 +08:00
|
|
|
MachineModuleInfoMachO &MMIMacho =
|
|
|
|
MMI->getObjFileInfo<MachineModuleInfoMachO>();
|
|
|
|
|
2010-01-21 05:36:48 +08:00
|
|
|
MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
|
|
|
|
if (!Stubs.empty())
|
|
|
|
EmitFunctionStubs(Stubs);
|
2004-07-17 04:29:04 +08:00
|
|
|
|
2009-08-23 05:43:10 +08:00
|
|
|
if (MAI->doesSupportExceptionHandling() && MMI) {
|
2007-11-21 07:24:42 +08:00
|
|
|
// Add the (possibly multiple) personalities to the set of global values.
|
2008-04-02 08:25:04 +08:00
|
|
|
// Only referenced functions get into the Personalities list.
|
2010-04-15 09:51:59 +08:00
|
|
|
const std::vector<const Function*> &Personalities = MMI->getPersonalities();
|
|
|
|
for (std::vector<const Function*>::const_iterator I = Personalities.begin(),
|
2009-07-15 09:14:44 +08:00
|
|
|
E = Personalities.end(); I != E; ++I) {
|
2010-01-21 05:16:14 +08:00
|
|
|
if (*I) {
|
2013-12-03 00:25:47 +08:00
|
|
|
MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
|
2010-03-11 06:34:10 +08:00
|
|
|
MachineModuleInfoImpl::StubValueTy &StubSym =
|
|
|
|
MMIMacho.getGVStubEntry(NLPSym);
|
2013-10-30 01:07:16 +08:00
|
|
|
StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
|
2010-01-21 05:16:14 +08:00
|
|
|
}
|
2009-07-15 09:14:44 +08:00
|
|
|
}
|
2007-11-21 07:24:42 +08:00
|
|
|
}
|
|
|
|
|
2010-01-21 05:16:14 +08:00
|
|
|
// Output stubs for dynamically-linked functions.
|
2010-01-21 05:36:48 +08:00
|
|
|
Stubs = MMIMacho.GetGVStubList();
|
2010-01-21 05:16:14 +08:00
|
|
|
|
2009-08-04 06:52:21 +08:00
|
|
|
// Output macho stubs for external and common global variables.
|
2010-01-21 05:16:14 +08:00
|
|
|
if (!Stubs.empty()) {
|
2009-08-10 09:39:42 +08:00
|
|
|
// Switch with ".non_lazy_symbol_pointer" directive.
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
|
2009-08-11 01:58:51 +08:00
|
|
|
EmitAlignment(isPPC64 ? 3 : 2);
|
|
|
|
|
2010-01-21 05:16:14 +08:00
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
2010-03-12 07:39:44 +08:00
|
|
|
// L_foo$stub:
|
|
|
|
OutStreamer.EmitLabel(Stubs[i].first);
|
|
|
|
// .indirect_symbol _foo
|
|
|
|
MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
|
|
|
|
OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
|
2010-03-12 10:00:43 +08:00
|
|
|
|
|
|
|
if (MCSym.getInt())
|
|
|
|
// External to current translation unit.
|
2013-01-09 09:57:54 +08:00
|
|
|
OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/);
|
2010-03-12 10:00:43 +08:00
|
|
|
else
|
|
|
|
// Internal to current translation unit.
|
2010-04-01 02:47:10 +08:00
|
|
|
//
|
|
|
|
// When we place the LSDA into the TEXT section, the type info pointers
|
|
|
|
// need to be indirect and pc-rel. We accomplish this by using NLPs.
|
|
|
|
// However, sometimes the types are local to the file. So we need to
|
|
|
|
// fill in the value for the NLP in those cases.
|
2010-03-12 10:00:43 +08:00
|
|
|
OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
|
|
|
|
OutContext),
|
2013-01-09 09:57:54 +08:00
|
|
|
isPPC64 ? 8 : 4/*size*/);
|
2005-12-13 12:33:58 +08:00
|
|
|
}
|
2010-03-12 07:39:44 +08:00
|
|
|
|
|
|
|
Stubs.clear();
|
|
|
|
OutStreamer.AddBlankLine();
|
2004-08-15 06:09:10 +08:00
|
|
|
}
|
2005-04-22 07:30:14 +08:00
|
|
|
|
2010-01-21 05:16:14 +08:00
|
|
|
Stubs = MMIMacho.GetHiddenGVStubList();
|
|
|
|
if (!Stubs.empty()) {
|
2009-08-19 13:49:37 +08:00
|
|
|
OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
|
2009-07-15 09:14:44 +08:00
|
|
|
EmitAlignment(isPPC64 ? 3 : 2);
|
2010-01-21 05:16:14 +08:00
|
|
|
|
|
|
|
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
|
2010-03-12 07:39:44 +08:00
|
|
|
// L_foo$stub:
|
|
|
|
OutStreamer.EmitLabel(Stubs[i].first);
|
|
|
|
// .long _foo
|
|
|
|
OutStreamer.EmitValue(MCSymbolRefExpr::
|
|
|
|
Create(Stubs[i].second.getPointer(),
|
|
|
|
OutContext),
|
2013-01-09 09:57:54 +08:00
|
|
|
isPPC64 ? 8 : 4/*size*/);
|
2008-12-05 09:06:39 +08:00
|
|
|
}
|
2010-03-12 07:39:44 +08:00
|
|
|
|
|
|
|
Stubs.clear();
|
|
|
|
OutStreamer.AddBlankLine();
|
2008-12-05 09:06:39 +08:00
|
|
|
}
|
|
|
|
|
2005-11-01 08:12:36 +08:00
|
|
|
// Funny Darwin hack: This flag tells the linker that no global symbols
|
|
|
|
// contain code that falls through to other global symbols (e.g. the obvious
|
|
|
|
// implementation of multiple entry points). If this doesn't occur, the
|
|
|
|
// linker can safely perform dead code stripping. Since LLVM never generates
|
|
|
|
// code that does this, it is always safe to set.
|
2010-01-23 14:39:22 +08:00
|
|
|
OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
|
2005-11-01 08:12:36 +08:00
|
|
|
|
2007-07-26 03:33:14 +08:00
|
|
|
return AsmPrinter::doFinalization(M);
|
2004-06-22 00:55:25 +08:00
|
|
|
}
|
2004-09-04 13:00:00 +08:00
|
|
|
|
2006-12-21 04:56:46 +08:00
|
|
|
/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
|
|
|
|
/// for a MachineFunction to the given output stream, in a format that the
|
2006-09-21 01:12:19 +08:00
|
|
|
/// Darwin assembler can deal with.
|
|
|
|
///
|
2010-04-04 16:18:47 +08:00
|
|
|
static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
|
2010-03-14 04:55:24 +08:00
|
|
|
MCStreamer &Streamer) {
|
2006-12-22 04:26:09 +08:00
|
|
|
const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
|
|
|
|
|
2009-07-22 02:38:57 +08:00
|
|
|
if (Subtarget->isDarwin())
|
2010-04-04 16:18:47 +08:00
|
|
|
return new PPCDarwinAsmPrinter(tm, Streamer);
|
|
|
|
return new PPCLinuxAsmPrinter(tm, Streamer);
|
2006-09-21 01:12:19 +08:00
|
|
|
}
|
2008-08-17 21:54:28 +08:00
|
|
|
|
2009-06-24 07:59:40 +08:00
|
|
|
// Force static initialization.
|
2009-07-16 04:24:03 +08:00
|
|
|
extern "C" void LLVMInitializePowerPCAsmPrinter() {
|
|
|
|
TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
|
|
|
|
TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
|
2014-03-05 00:51:52 +08:00
|
|
|
TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass);
|
2009-07-16 04:24:03 +08:00
|
|
|
}
|