2006-09-04 12:14:57 +08:00
|
|
|
//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file contains the entry points for global functions defined in the LLVM
|
|
|
|
// PowerPC back-end.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-08-14 00:26:38 +08:00
|
|
|
#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
|
|
|
|
#define LLVM_LIB_TARGET_POWERPC_PPC_H
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2011-07-15 04:59:42 +08:00
|
|
|
#include "MCTargetDesc/PPCMCTargetDesc.h"
|
2010-11-15 16:49:58 +08:00
|
|
|
#include <string>
|
|
|
|
|
2006-11-04 13:27:39 +08:00
|
|
|
// GCC #defines PPC on Linux but we use it as our namespace name
|
|
|
|
#undef PPC
|
2006-08-24 05:08:52 +08:00
|
|
|
|
2006-11-04 13:27:39 +08:00
|
|
|
namespace llvm {
|
|
|
|
class PPCTargetMachine;
|
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA
The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:
1. This will eliminate an otherwise-necessary copy of the addend
2. One of the product operands is killed by the instruction
The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.
As a simple example, if we have:
%vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
...
%vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
%RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
...
We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
is replaced by:
%vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
%RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
llvm-svn: 204768
2014-03-26 07:29:21 +08:00
|
|
|
class PassRegistry;
|
2006-11-04 13:27:39 +08:00
|
|
|
class FunctionPass;
|
2013-01-26 07:05:59 +08:00
|
|
|
class ImmutablePass;
|
2010-11-15 05:12:33 +08:00
|
|
|
class MachineInstr;
|
|
|
|
class AsmPrinter;
|
Implement a basic MCCodeEmitter for PPC. This doesn't handle
fixups yet, and doesn't handle actually encoding operand values,
but this is enough for llc -show-mc-encoding to show the base
instruction encoding information, e.g.:
mflr r0 ; encoding: [0x7c,0x08,0x02,0xa6]
stw r0, 8(r1) ; encoding: [0x90,0x00,0x00,0x00]
stwu r1, -64(r1) ; encoding: [0x94,0x00,0x00,0x00]
Ltmp0:
lhz r4, 4(r3) ; encoding: [0xa0,0x00,0x00,0x00]
cmplwi cr0, r4, 8 ; encoding: [0x28,0x00,0x00,0x00]
beq cr0, LBB0_2 ; encoding: [0x40,0x00,0x00,0x00]
llvm-svn: 119116
2010-11-15 12:16:32 +08:00
|
|
|
class MCInst;
|
2012-03-18 02:46:09 +08:00
|
|
|
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
|
Add a PPCCTRLoops verification pass
When asserts are enabled, this adds a verification pass for PPC counter-loop
formation. Unfortunately, without sacrificing code quality, there is no better
way of forming counter-based loops except at the (late) IR level. This means
that we need to recognize, at the IR level, anything which might turn into a
function call (or indirect branch). Because this is currently a finite set of
things, and because SelectionDAG lowering is basic-block local, this can be
done. Nevertheless, it is fragile, and failure results in a miscompile. This
verification pass checks that all (reachable) counter-based branches are
dominated by a loop mtctr instruction, and that no instructions in between
clobber the counter register. If these conditions are not satisfied, then an
ICE will be triggered.
In short, this is to help us sleep better at night.
llvm-svn: 182295
2013-05-21 00:08:17 +08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
FunctionPass *createPPCCTRLoopsVerify();
|
|
|
|
#endif
|
2013-04-09 00:24:03 +08:00
|
|
|
FunctionPass *createPPCEarlyReturnPass();
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
FunctionPass *createPPCVSXCopyPass();
|
2014-03-28 07:12:31 +08:00
|
|
|
FunctionPass *createPPCVSXCopyCleanupPass();
|
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA
The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:
1. This will eliminate an otherwise-necessary copy of the addend
2. One of the product operands is killed by the instruction
The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.
As a simple example, if we have:
%vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
...
%vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
%RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
...
We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
is replaced by:
%vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
%RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
llvm-svn: 204768
2014-03-26 07:29:21 +08:00
|
|
|
FunctionPass *createPPCVSXFMAMutatePass();
|
2010-11-15 11:13:19 +08:00
|
|
|
FunctionPass *createPPCBranchSelectionPass();
|
|
|
|
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
|
|
|
|
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
|
[PowerPC] Always use "assembler dialect" 1
A setting in MCAsmInfo defines the "assembler dialect" to use. This is used
by common code to choose between alternatives in a multi-alternative GNU
inline asm statement like the following:
__asm__ ("{sfe|subfe} %0,%1,%2" : "=r" (out) : "r" (in1), "r" (in2));
The meaning of these dialects is platform specific, and GCC defines those
for PowerPC to use dialect 0 for old-style (POWER) mnemonics and 1 for
new-style (PowerPC) mnemonics, like in the example above.
To be compatible with inline asm used with GCC, LLVM ought to do the same.
Specifically, this means we should always use assembler dialect 1 since
old-style mnemonics really aren't supported on any current platform.
However, the current LLVM back-end uses:
AssemblerDialect = 1; // New-Style mnemonics.
in PPCMCAsmInfoDarwin, and
AssemblerDialect = 0; // Old-Style mnemonics.
in PPCLinuxMCAsmInfo.
The Linux setting really isn't correct, we should be using new-style
mnemonics everywhere. This is changed by this commit.
Unfortunately, the setting of this variable is overloaded in the back-end
to decide whether or not we are on a Darwin target. This is done in
PPCInstPrinter (the "SyntaxVariant" is initialized from the MCAsmInfo
AssemblerDialect setting), and also in PPCMCExpr. Setting AssemblerDialect
to 1 for both Darwin and Linux no longer allows us to make this distinction.
Instead, this patch uses the MCSubtargetInfo passed to createPPCMCInstPrinter
to distinguish Darwin targets, and ignores the SyntaxVariant parameter.
As to PPCMCExpr, this patch adds an explicit isDarwin argument that needs
to be passed in by the caller when creating a target MCExpr. (To do so
this patch implicitly also reverts commit 184441.)
llvm-svn: 185858
2013-07-09 04:20:51 +08:00
|
|
|
AsmPrinter &AP, bool isDarwin);
|
2013-01-26 07:05:59 +08:00
|
|
|
|
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA
The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:
1. This will eliminate an otherwise-necessary copy of the addend
2. One of the product operands is killed by the instruction
The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.
As a simple example, if we have:
%vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
...
%vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
%RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
...
We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
is replaced by:
%vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
%RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
llvm-svn: 204768
2014-03-26 07:29:21 +08:00
|
|
|
void initializePPCVSXFMAMutatePass(PassRegistry&);
|
|
|
|
extern char &PPCVSXFMAMutateID;
|
|
|
|
|
2010-11-15 07:42:06 +08:00
|
|
|
namespace PPCII {
|
|
|
|
|
|
|
|
/// Target Operand Flag enum.
|
|
|
|
enum TOF {
|
|
|
|
//===------------------------------------------------------------------===//
|
|
|
|
// PPC Specific MachineOperand flags.
|
|
|
|
MO_NO_FLAG,
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
/// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
|
|
|
|
/// reference is actually to the "FOO$stub" or "FOO@plt" symbol. This is
|
|
|
|
/// used for calls and jumps to external functions on Tiger and earlier, and
|
|
|
|
/// for PIC calls on Linux and ELF systems.
|
|
|
|
MO_PLT_OR_STUB = 1,
|
2010-11-15 07:42:06 +08:00
|
|
|
|
2010-11-15 11:13:19 +08:00
|
|
|
/// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
|
|
|
|
/// the function's picbase, e.g. lo16(symbol-picbase).
|
2013-02-21 08:05:29 +08:00
|
|
|
MO_PIC_FLAG = 2,
|
2010-11-15 10:46:57 +08:00
|
|
|
|
2010-11-15 11:13:19 +08:00
|
|
|
/// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
|
|
|
|
/// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
|
2013-02-21 08:05:29 +08:00
|
|
|
MO_NLP_FLAG = 4,
|
2010-11-15 10:46:57 +08:00
|
|
|
|
2010-11-15 11:13:19 +08:00
|
|
|
/// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
|
|
|
|
/// symbol with hidden visibility. This causes a different kind of
|
|
|
|
/// non-lazy-pointer to be generated.
|
2013-02-21 08:05:29 +08:00
|
|
|
MO_NLP_HIDDEN_FLAG = 8,
|
2012-06-05 01:36:38 +08:00
|
|
|
|
|
|
|
/// The next are not flags but distinct values.
|
2013-02-21 08:05:29 +08:00
|
|
|
MO_ACCESS_MASK = 0xf0,
|
2012-06-05 01:36:38 +08:00
|
|
|
|
2013-06-21 22:42:20 +08:00
|
|
|
/// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
|
|
|
|
MO_LO = 1 << 4,
|
|
|
|
MO_HA = 2 << 4,
|
2012-06-05 01:36:38 +08:00
|
|
|
|
2013-06-21 22:42:20 +08:00
|
|
|
MO_TPREL_LO = 4 << 4,
|
|
|
|
MO_TPREL_HA = 3 << 4,
|
2013-02-21 08:05:29 +08:00
|
|
|
|
|
|
|
/// These values identify relocations on immediates folded
|
|
|
|
/// into memory operations.
|
2013-06-21 22:42:20 +08:00
|
|
|
MO_DTPREL_LO = 5 << 4,
|
|
|
|
MO_TLSLD_LO = 6 << 4,
|
2013-07-05 20:22:36 +08:00
|
|
|
MO_TOC_LO = 7 << 4,
|
|
|
|
|
|
|
|
// Symbol for VK_PPC_TLS fixup attached to an ADD instruction
|
[PowerPC] Replace foul hackery with real calls to __tls_get_addr
My original support for the general dynamic and local dynamic TLS
models contained some fairly obtuse hacks to generate calls to
__tls_get_addr when lowering a TargetGlobalAddress. Rather than
generating real calls, special GET_TLS_ADDR nodes were used to wrap
the calls and only reveal them at assembly time. I attempted to
provide correct parameter and return values by chaining CopyToReg and
CopyFromReg nodes onto the GET_TLS_ADDR nodes, but this was also not
fully correct. Problems were seen with two back-to-back stores to TLS
variables, where the call sequences ended up overlapping with unhappy
results. Additionally, since these weren't real calls, the proper
register side effects of a call were not recorded, so clobbered values
were kept live across the calls.
The proper thing to do is to lower these into calls in the first
place. This is relatively straightforward; see the changes to
PPCTargetLowering::LowerGlobalTLSAddress() in PPCISelLowering.cpp.
The changes here are standard call lowering, except that we need to
track the fact that these calls will require a relocation. This is
done by adding a machine operand flag of MO_TLSLD or MO_TLSGD to the
TargetGlobalAddress operand that appears earlier in the sequence.
The calls to LowerCallTo() eventually find their way to
LowerCall_64SVR4() or LowerCall_32SVR4(), which call FinishCall(),
which calls PrepareCall(). In PrepareCall(), we detect the calls to
__tls_get_addr and immediately snag the TargetGlobalTLSAddress with
the annotated relocation information. This becomes an extra operand
on the call following the callee, which is expected for nodes of type
tlscall. We change the call opcode to CALL_TLS for this case. Back
in FinishCall(), we change it again to CALL_NOP_TLS for 64-bit only,
since we require a TOC-restore nop following the call for the 64-bit
ABIs.
During selection, patterns in PPCInstrInfo.td and PPCInstr64Bit.td
convert the CALL_TLS nodes into BL_TLS nodes, and convert the
CALL_NOP_TLS nodes into BL8_NOP_TLS nodes. This replaces the code
removed from PPCAsmPrinter.cpp, as the BL_TLS or BL8_NOP_TLS
nodes can now be emitted normally using their patterns and the
associated printTLSCall print method.
Finally, as a result of these changes, all references to get-tls-addr
in its various guises are no longer used, so they have been removed.
There are existing TLS tests to verify the changes haven't messed
anything up). I've added one new test that verifies that the problem
with the original code has been fixed.
llvm-svn: 221703
2014-11-12 04:44:09 +08:00
|
|
|
MO_TLS = 8 << 4,
|
|
|
|
|
|
|
|
// Symbols for VK_PPC_TLSGD and VK_PPC_TLSLD in __tls_get_addr
|
|
|
|
// call sequences.
|
|
|
|
MO_TLSLD = 9 << 4,
|
|
|
|
MO_TLSGD = 10 << 4
|
2010-11-15 07:42:06 +08:00
|
|
|
};
|
|
|
|
} // end namespace PPCII
|
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
} // end namespace llvm;
|
|
|
|
|
|
|
|
#endif
|