2005-10-16 13:39:50 +08:00
|
|
|
//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
2005-04-22 07:30:14 +08:00
|
|
|
//
|
2005-08-16 07:47:04 +08:00
|
|
|
// Top-level implementation for the PowerPC target.
|
2004-06-22 00:55:25 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-10-15 07:59:06 +08:00
|
|
|
#include "PPCTargetMachine.h"
|
2012-03-18 02:46:09 +08:00
|
|
|
#include "PPC.h"
|
2015-01-14 19:23:27 +08:00
|
|
|
#include "PPCTargetObjectFile.h"
|
2015-01-31 19:17:59 +08:00
|
|
|
#include "PPCTargetTransformInfo.h"
|
2016-04-29 07:42:51 +08:00
|
|
|
#include "llvm/CodeGen/LiveVariables.h"
|
2012-02-03 13:12:41 +08:00
|
|
|
#include "llvm/CodeGen/Passes.h"
|
2016-05-10 11:21:59 +08:00
|
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
2014-10-06 14:45:36 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
2015-02-13 18:01:29 +08:00
|
|
|
#include "llvm/IR/LegacyPassManager.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/MC/MCStreamer.h"
|
2012-06-08 23:38:21 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2009-07-15 04:18:05 +08:00
|
|
|
#include "llvm/Support/FormattedStream.h"
|
2011-08-25 02:08:43 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Target/TargetOptions.h"
|
2014-11-21 12:35:51 +08:00
|
|
|
#include "llvm/Transforms/Scalar.h"
|
2004-06-22 00:55:25 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2012-06-08 23:38:21 +08:00
|
|
|
/// Hidden command-line switch "-disable-ppc-ctrloops".
/// Per its description it turns off CTR loops for PPC; the code that reads the
/// flag is not part of this section of the file.
static cl::opt<bool>
    DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                    cl::desc("Disable CTR loops for PPC"));
|
2012-06-08 23:38:21 +08:00
|
|
|
|
[PowerPC] Prepare loops for pre-increment loads/stores
PowerPC supports pre-increment load/store instructions (except for Altivec/VSX
vector load/stores). Using these on embedded cores can be very important, but
most loops are not naturally set up to use them. We can often change that,
however, by placing loops into a non-canonical form. Generically, this means
transforming loops like this:
for (int i = 0; i < n; ++i)
array[i] = c;
to look like this:
T *p = &array[-1];
for (int i = 0; i < n; ++i)
*++p = c;
the key point is that addresses accessed are pulled into dedicated PHIs and
"pre-decremented" in the loop preheader. This allows the use of pre-increment
load/store instructions without loop peeling.
A target-specific late IR-level pass (running post-LSR), PPCLoopPreIncPrep, is
introduced to perform this transformation. I've used this code out-of-tree for
generating code for the PPC A2 for over a year. Somewhat to my surprise,
running the test suite + externals on a P7 with this transformation enabled
showed no performance regressions, and one speedup:
External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk
-2.32514% +/- 1.03736%
So I'm going to enable it on everything for now. I was surprised by this
because, on the POWER cores, these pre-increment load/store instructions are
cracked (and, thus, harder to schedule effectively). But seeing no regressions,
and feeling that it is generally easier to split instructions apart late than
it is to combine them late, this might be the better approach regardless.
In the future, we might want to integrate this functionality into LSR (but
currently LSR does not create new PHI nodes, so (for that and other reasons)
significant work would need to be done).
llvm-svn: 228328
2015-02-06 02:43:00 +08:00
|
|
|
/// Hidden command-line switch "-disable-ppc-preinc-prep".
/// Per its description it turns off the PPC loop pre-increment preparation;
/// the code that reads the flag is not part of this section of the file.
static cl::opt<bool>
    DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden,
                      cl::desc("Disable PPC loop preinc prep"));
|
|
|
|
|
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA
The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:
1. This will eliminate an otherwise-necessary copy of the addend
2. One of the product operands is killed by the instruction
The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.
As a simple example, if we have:
%vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
...
%vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
%RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
...
We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
is replaced by:
%vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
%RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
llvm-svn: 204768
2014-03-26 07:29:21 +08:00
|
|
|
/// Hidden command-line switch "-schedule-ppc-vsx-fma-mutation-early".
/// Per its description it asks for the VSX FMA instruction mutation to be
/// scheduled early; the code that reads the flag is not part of this section
/// of the file.
static cl::opt<bool> VSXFMAMutateEarly(
    "schedule-ppc-vsx-fma-mutation-early", cl::Hidden,
    cl::desc("Schedule VSX FMA instruction mutation early"));
|
|
|
|
|
[PPC64LE] Remove unnecessary swaps from lane-insensitive vector computations
This patch adds a new SSA MI pass that runs on little-endian PPC64
code with VSX enabled. Loads and stores of 4x32 and 2x64 vectors
without alignment constraints are accomplished for little-endian using
lxvd2x/xxswapd and xxswapd/stxvd2x. The existence of the additional
xxswapd instructions hurts performance in comparison with big-endian
code, but they are necessary in the general case to support correct
semantics.
However, the general case does not apply to most vector code. Many
vector instructions are lane-insensitive; they do not "care" which
lanes the parallel computations are performed within, provided that
the resulting data is stored into the correct locations. Thus this
pass looks for computations that perform only lane-insensitive
operations, and removes the unnecessary swaps from loads and stores in
such computations.
Future improvements will allow computations using certain
lane-sensitive operations to also be optimized in this manner, by
modifying the lane-sensitive operations to account for the permuted
order of the lanes. However, this patch only adds the infrastructure
to permit this; no lane-sensitive operations are optimized at this
time.
This code is heavily exercised by the various vectorizing applications
in the projects/test-suite tree. For the time being, I have only added
one simple test case to demonstrate what the pass is doing. Although
it is quite simple, it provides coverage for much of the code,
including the special case handling of copies and subreg-to-reg
operations feeding the swaps. I plan to add additional tests in the
future as I fill in more of the "special handling" code.
Two existing tests were affected, because they expected the swaps to
be present, but they are now removed.
llvm-svn: 235910
2015-04-28 03:57:34 +08:00
|
|
|
/// Hidden command-line switch "-disable-ppc-vsx-swap-removal".
static cl::opt<bool>
    DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
                          cl::desc("Disable VSX Swap Removal for PPC"));

/// Hidden command-line switch "-disable-ppc-qpx-load-splat".
static cl::opt<bool>
    DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
                        cl::desc("Disable QPX load splat simplification"));

/// Hidden command-line switch "-disable-ppc-peephole".
static cl::opt<bool>
    DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
                      cl::desc("Disable machine peepholes for PPC"));

/// Hidden command-line switch "-ppc-gep-opt"; defaults to true.
static cl::opt<bool>
    EnableGEPOpt("ppc-gep-opt", cl::Hidden,
                 cl::desc("Enable optimizations on complex GEPs"),
                 cl::init(true));
|
|
|
|
|
[PowerPC] Loop Data Prefetching for the BG/Q
The IBM BG/Q supercomputer's A2 cores have a hardware prefetching unit, the
L1P, but it does not prefetch directly into the A2's L1 cache. Instead, it
prefetches into its own L1P buffer, and the latency to access that buffer is
significantly higher than that to the L1 cache (although smaller than the
latency to the L2 cache). As a result, especially when multiple hardware
threads are not actively busy, explicitly prefetching data into the L1 cache is
advantageous.
I've been using this pass out-of-tree for data prefetching on the BG/Q for well
over a year, and it has worked quite well. It is enabled by default only for
the BG/Q, but can be enabled for other cores as well via a command-line option.
Eventually, we might want to add some TTI interfaces and move this into
Transforms/Scalar (there is nothing particularly target dependent about it,
although only machines like the BG/Q will benefit from its simplistic
strategy).
llvm-svn: 229966
2015-02-20 13:08:21 +08:00
|
|
|
/// Hidden command-line switch "-enable-ppc-prefetching"; defaults to false.
/// When given explicitly on the command line, its value overrides the
/// BG/Q-only default chosen in PPCPassConfig::addIRPasses().
///
/// The help text previously read "disable software prefetching on PPC", which
/// contradicted both the option name and its effect; it now says "enable".
static cl::opt<bool>
    EnablePrefetch("enable-ppc-prefetching",
                   cl::desc("enable software prefetching on PPC"),
                   cl::init(false), cl::Hidden);
|
|
|
|
|
[PowerPC] Add extra r2 read deps on @toc@l relocations
If some commits are happy, and some commits are sad, this is a sad commit. It
is sad because it restricts instruction scheduling to work around a binutils
linker bug, and moreover, one that may never be fixed. On 2012-05-21, GCC was
updated not to produce code triggering this bug, and now we'll do the same...
When resolving an address using the ELF ABI TOC pointer, two relocations are
generally required: one for the high part and one for the low part. Only
the high part generally explicitly depends on r2 (the TOC pointer). And, so,
we might produce code like this:
.Ltmp526:
addis 3, 2, .LC12@toc@ha
.Ltmp1628:
std 2, 40(1)
ld 5, 0(27)
ld 2, 8(27)
ld 11, 16(27)
ld 3, .LC12@toc@l(3)
rldicl 4, 4, 0, 32
mtctr 5
bctrl
ld 2, 40(1)
And there is nothing wrong with this code, as such, but there is a linker bug
in binutils (https://sourceware.org/bugzilla/show_bug.cgi?id=18414) that will
misoptimize this code sequence to this:
nop
std r2,40(r1)
ld r5,0(r27)
ld r2,8(r27)
ld r11,16(r27)
ld r3,-32472(r2)
clrldi r4,r4,32
mtctr r5
bctrl
ld r2,40(r1)
because the linker does not know (and does not check) that the value in r2
changed in between the instruction using the .LC12@toc@ha (TOC-relative)
relocation and the instruction using the .LC12@toc@l(3) relocation.
Because it finds these instructions using the relocations (and not by
scanning the instructions), it has been asserted that there is no good way
to detect the change of r2 in between. As a result, this bug may never be
fixed (i.e. it may become part of the definition of the ABI). GCC was
updated to add extra dependencies on r2 to instructions using the @toc@l
relocations to avoid this problem, and we'll do the same here.
This is done as a separate pass because:
1. These extra r2 dependencies are not really properties of the
instructions, but rather due to a linker bug, and maybe one day we'll be
able to get rid of them when targeting linkers without this bug (and,
thus, keeping the logic centralized here will make that
straightforward).
2. There are ISel-level peephole optimizations that propagate the @toc@l
relocations to some user instructions, and so the extra dependencies do
not apply only to a fixed set of instructions (without undesirable
definition replication).
The test case was reduced with the help of bugpoint, with minimal cleaning. I'm
looking forward to our upcoming MI serialization support, and with that, much
better tests can be created.
llvm-svn: 237556
2015-05-18 14:25:59 +08:00
|
|
|
/// Hidden command-line switch "-enable-ppc-extra-toc-reg-deps"; defaults to
/// true.
static cl::opt<bool>
    EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps",
                          cl::desc("Add extra TOC register dependencies"),
                          cl::init(true), cl::Hidden);

/// Hidden command-line switch "-ppc-machine-combiner"; defaults to true.
static cl::opt<bool>
    EnableMachineCombinerPass("ppc-machine-combiner",
                              cl::desc("Enable the machine combiner pass"),
                              cl::init(true), cl::Hidden);
|
|
|
|
|
2009-07-25 14:49:55 +08:00
|
|
|
extern "C" void LLVMInitializePowerPCTarget() {
|
|
|
|
// Register the targets
|
2012-02-03 13:12:30 +08:00
|
|
|
RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
|
2009-07-25 14:49:55 +08:00
|
|
|
RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
|
2013-07-26 09:35:43 +08:00
|
|
|
RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
|
2015-12-08 04:50:29 +08:00
|
|
|
|
|
|
|
PassRegistry &PR = *PassRegistry::getPassRegistry();
|
|
|
|
initializePPCBoolRetToIntPass(PR);
|
2009-07-25 14:49:55 +08:00
|
|
|
}
|
2009-06-17 04:12:29 +08:00
|
|
|
|
2015-01-27 03:03:15 +08:00
|
|
|
/// Return the datalayout string of a subtarget.
|
|
|
|
static std::string getDataLayoutString(const Triple &T) {
|
|
|
|
bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
|
|
|
|
std::string Ret;
|
|
|
|
|
|
|
|
// Most PPC* platforms are big endian, PPC64LE is little endian.
|
|
|
|
if (T.getArch() == Triple::ppc64le)
|
|
|
|
Ret = "e";
|
|
|
|
else
|
|
|
|
Ret = "E";
|
|
|
|
|
|
|
|
Ret += DataLayout::getManglingComponent(T);
|
|
|
|
|
|
|
|
// PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
|
|
|
|
// pointers.
|
|
|
|
if (!is64Bit || T.getOS() == Triple::Lv2)
|
|
|
|
Ret += "-p:32:32";
|
|
|
|
|
|
|
|
// Note, the alignment values for f64 and i64 on ppc64 in Darwin
|
|
|
|
// documentation are wrong; these are correct (i.e. "what gcc does").
|
|
|
|
if (is64Bit || !T.isOSDarwin())
|
|
|
|
Ret += "-i64:64";
|
|
|
|
else
|
|
|
|
Ret += "-f64:32:64";
|
|
|
|
|
|
|
|
// PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
|
|
|
|
if (is64Bit)
|
|
|
|
Ret += "-n32:64";
|
|
|
|
else
|
|
|
|
Ret += "-n32";
|
|
|
|
|
|
|
|
return Ret;
|
|
|
|
}
|
|
|
|
|
2015-06-16 21:15:50 +08:00
|
|
|
/// Return the feature string \p FS with the target-wide default features
/// prepended to it.
///
/// \param FS  Feature string supplied by the caller (may be empty).
/// \param OL  Code generation optimization level.
/// \param TT  Target triple; only its architecture is inspected.
///
/// Each added feature is placed in front of the string built so far, so the
/// caller's own features stay at the end.
static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
                                      const Triple &TT) {
  std::string FullFS = FS;

  // Put Feature at the front of FullFS, comma-separated when FullFS already
  // has content.
  auto PrependFeature = [&FullFS](const char *Feature) {
    if (FullFS.empty())
      FullFS = Feature;
    else
      FullFS = std::string(Feature) + "," + FullFS;
  };

  // Make sure 64-bit features are available when CPUname is generic.
  const bool Is64BitArch =
      TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
  if (Is64BitArch)
    PrependFeature("+64bit");

  if (OL >= CodeGenOpt::Default)
    PrependFeature("+crbits");

  // PPC64 ELFv1 function descriptors are almost always generated by the linker
  // and therefore invariant, so when optimizing the descriptor loads are
  // marked invariant by default (introduced in llvm-svn 226207). It remains a
  // feature so that it can be switched off.
  if (OL != CodeGenOpt::None)
    PrependFeature("+invariant-function-descriptors");

  return FullFS;
}
|
|
|
|
|
2014-11-13 17:26:31 +08:00
|
|
|
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
|
|
|
|
// If it isn't a Mach-O file then it's going to be a linux ELF
|
|
|
|
// object file.
|
|
|
|
if (TT.isOSDarwin())
|
|
|
|
return make_unique<TargetLoweringObjectFileMachO>();
|
|
|
|
|
|
|
|
return make_unique<PPC64LinuxTargetObjectFile>();
|
|
|
|
}
|
|
|
|
|
2015-02-17 14:45:15 +08:00
|
|
|
/// Determine the ABI to use for \p TT.
///
/// An explicit ABI name in \p Options (prefix "elfv1" or "elfv2") wins. With
/// no ABI name, non-MacOSX ppc64le selects ELFv2 and non-MacOSX ppc64 selects
/// ELFv1. Every other case yields PPC_ABI_UNKNOWN.
static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
                                                 const TargetOptions &Options) {
  const StringRef ABIName = Options.MCOptions.getABIName();

  if (ABIName.startswith("elfv1"))
    return PPCTargetMachine::PPC_ABI_ELFv1;
  if (ABIName.startswith("elfv2"))
    return PPCTargetMachine::PPC_ABI_ELFv2;

  // Any other non-empty name is a caller error.
  assert(ABIName.empty() &&
         "Unknown target-abi option!");

  if (!TT.isMacOSX()) {
    const auto Arch = TT.getArch();
    if (Arch == Triple::ppc64le)
      return PPCTargetMachine::PPC_ABI_ELFv2;
    if (Arch == Triple::ppc64)
      return PPCTargetMachine::PPC_ABI_ELFv1;
    // Other architectures fall through to the unknown ABI.
  }

  return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
|
|
|
|
|
2015-09-22 19:14:12 +08:00
|
|
|
// The FeatureString here is a little subtle. We are modifying the feature
|
|
|
|
// string with what are (currently) non-function specific overrides as it goes
|
|
|
|
// into the LLVMTargetMachine constructor and then using the stored value in the
|
2014-10-02 04:38:26 +08:00
|
|
|
// Subtarget constructor below it.
|
2015-06-12 03:41:26 +08:00
|
|
|
PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
|
|
|
|
StringRef CPU, StringRef FS,
|
|
|
|
const TargetOptions &Options,
|
2011-07-20 15:51:56 +08:00
|
|
|
Reloc::Model RM, CodeModel::Model CM,
|
2014-08-09 12:38:56 +08:00
|
|
|
CodeGenOpt::Level OL)
|
2015-06-12 03:41:26 +08:00
|
|
|
: LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
|
2015-06-16 21:15:50 +08:00
|
|
|
computeFSAdditions(FS, OL, TT), Options, RM, CM, OL),
|
2015-06-16 23:44:21 +08:00
|
|
|
TLOF(createTLOF(getTargetTriple())),
|
2015-07-12 10:33:57 +08:00
|
|
|
TargetABI(computeTargetABI(TT, Options)),
|
|
|
|
Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
|
|
|
|
|
|
|
|
// For the estimates, convergence is quadratic, so we essentially double the
|
|
|
|
// number of digits correct after every iteration. For both FRE and FRSQRTE,
|
|
|
|
// the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
|
|
|
|
// this is 2^-14. IEEE float has 23 digits and double has 52 digits.
|
|
|
|
unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3,
|
|
|
|
RefinementSteps64 = RefinementSteps + 1;
|
|
|
|
|
|
|
|
this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps);
|
|
|
|
this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps);
|
|
|
|
this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps);
|
|
|
|
this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps);
|
|
|
|
|
|
|
|
this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64);
|
|
|
|
this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64);
|
|
|
|
this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64);
|
|
|
|
this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64);
|
|
|
|
|
2013-05-13 09:16:13 +08:00
|
|
|
initAsmInfo();
|
2005-10-16 13:39:50 +08:00
|
|
|
}
|
|
|
|
|
2014-11-21 07:37:18 +08:00
|
|
|
// Out-of-line destructor with an empty body; members are destroyed
// implicitly.
PPCTargetMachine::~PPCTargetMachine() {}
|
|
|
|
|
2011-12-20 10:50:00 +08:00
|
|
|
// Intentionally empty. NOTE(review): presumably defined out of line to pin the
// class's vtable to this translation unit -- confirm against the header.
void PPC32TargetMachine::anchor() { }
|
|
|
|
|
2015-06-12 03:41:26 +08:00
|
|
|
/// Construct the 32-bit PowerPC target machine; every argument is forwarded
/// unchanged to the PPCTargetMachine base constructor.
PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Reloc::Model RM, CodeModel::Model CM,
                                       CodeGenOpt::Level OL)
    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
|
2006-06-16 09:37:27 +08:00
|
|
|
|
2011-12-20 10:50:00 +08:00
|
|
|
// Intentionally empty. NOTE(review): presumably defined out of line to pin the
// class's vtable to this translation unit -- confirm against the header.
void PPC64TargetMachine::anchor() { }
|
2006-06-16 09:37:27 +08:00
|
|
|
|
2015-06-12 03:41:26 +08:00
|
|
|
/// Construct the 64-bit PowerPC target machine; every argument is forwarded
/// unchanged to the PPCTargetMachine base constructor.
PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Reloc::Model RM, CodeModel::Model CM,
                                       CodeGenOpt::Level OL)
    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
|
2006-06-16 09:37:27 +08:00
|
|
|
|
2014-10-06 14:45:36 +08:00
|
|
|
/// Return the subtarget to use for function \p F, creating and caching it on
/// first use.
///
/// The CPU and feature string come from the function's "target-cpu" and
/// "target-features" attributes when present, otherwise from the target
/// machine's own TargetCPU / TargetFS. Subtargets are cached in SubtargetMap
/// under the key CPU + FS.
const PPCSubtarget *
PPCTargetMachine::getSubtargetImpl(const Function &F) const {
  Attribute CPUAttr = F.getFnAttribute("target-cpu");
  Attribute FSAttr = F.getFnAttribute("target-features");

  // Start from the machine-wide defaults and let function attributes override
  // them.
  std::string CPU = TargetCPU;
  if (!CPUAttr.hasAttribute(Attribute::None))
    CPU = CPUAttr.getValueAsString().str();

  std::string FS = TargetFS;
  if (!FSAttr.hasAttribute(Attribute::None))
    FS = FSAttr.getValueAsString().str();

  // FIXME: This is related to the code below to reset the target options,
  // we need to know whether or not the soft float flag is set on the
  // function before we can generate a subtarget. We also need to use
  // it as a key for the subtarget since that can be the only difference
  // between two functions.
  const bool UseSoftFloat =
      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
  // If the soft float attribute is set on the function turn on the soft float
  // subtarget feature.
  if (UseSoftFloat) {
    if (!FS.empty())
      FS += ",";
    FS += "+soft-float";
  }

  auto &Cached = SubtargetMap[CPU + FS];
  if (Cached)
    return Cached.get();

  // This needs to be done before we create a new subtarget since any
  // creation will depend on the TM and the code generation flags on the
  // function that reside in TargetOptions.
  resetTargetOptions(F);

  // FIXME: It would be good to have the subtarget additions here
  // not necessary. Anything that turns them on/off (overrides) ends
  // up being put at the end of the feature string, but the defaults
  // shouldn't require adding them. Fixing this means pulling Feature64Bit
  // out of most of the target cpus in the .td file and making it set only
  // as part of initialization via the TargetTriple.
  Cached = llvm::make_unique<PPCSubtarget>(
      TargetTriple, CPU,
      computeFSAdditions(FS, getOptLevel(), getTargetTriple()), *this);
  return Cached.get();
}
|
2004-08-11 15:40:04 +08:00
|
|
|
|
2006-09-04 12:14:57 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Pass Pipeline Configuration
|
|
|
|
//===----------------------------------------------------------------------===//
|
2004-08-11 15:40:04 +08:00
|
|
|
|
2012-02-03 13:12:41 +08:00
|
|
|
namespace {
|
|
|
|
/// PPC Code Generator Pass Configuration Options.
|
|
|
|
class PPCPassConfig : public TargetPassConfig {
|
|
|
|
public:
|
2012-02-04 10:56:59 +08:00
|
|
|
PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
|
|
|
|
: TargetPassConfig(TM, PM) {}
|
2012-02-03 13:12:41 +08:00
|
|
|
|
|
|
|
PPCTargetMachine &getPPCTargetMachine() const {
|
|
|
|
return getTM<PPCTargetMachine>();
|
|
|
|
}
|
|
|
|
|
2014-09-24 04:46:49 +08:00
|
|
|
void addIRPasses() override;
|
2014-04-29 15:57:37 +08:00
|
|
|
bool addPreISel() override;
|
|
|
|
bool addILPOpts() override;
|
|
|
|
bool addInstSelector() override;
|
[PPC64LE] Remove unnecessary swaps from lane-insensitive vector computations
This patch adds a new SSA MI pass that runs on little-endian PPC64
code with VSX enabled. Loads and stores of 4x32 and 2x64 vectors
without alignment constraints are accomplished for little-endian using
lxvd2x/xxswapd and xxswapd/stxvd2x. The existence of the additional
xxswapd instructions hurts performance in comparison with big-endian
code, but they are necessary in the general case to support correct
semantics.
However, the general case does not apply to most vector code. Many
vector instructions are lane-insensitive; they do not "care" which
lanes the parallel computations are performed within, provided that
the resulting data is stored into the correct locations. Thus this
pass looks for computations that perform only lane-insensitive
operations, and remove the unnecessary swaps from loads and stores in
such computations.
Future improvements will allow computations using certain
lane-sensitive operations to also be optimized in this manner, by
modifying the lane-sensitive operations to account for the permuted
order of the lanes. However, this patch only adds the infrastructure
to permit this; no lane-sensitive operations are optimized at this
time.
This code is heavily exercised by the various vectorizing applications
in the projects/test-suite tree. For the time being, I have only added
one simple test case to demonstrate what the pass is doing. Although
it is quite simple, it provides coverage for much of the code,
including the special case handling of copies and subreg-to-reg
operations feeding the swaps. I plan to add additional tests in the
future as I fill in more of the "special handling" code.
Two existing tests were affected, because they expected the swaps to
be present, but they are now removed.
llvm-svn: 235910
2015-04-28 03:57:34 +08:00
|
|
|
void addMachineSSAOptimization() override;
|
2014-12-12 05:26:47 +08:00
|
|
|
void addPreRegAlloc() override;
|
|
|
|
void addPreSched2() override;
|
|
|
|
void addPreEmitPass() override;
|
2012-02-03 13:12:41 +08:00
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
2012-02-04 10:56:59 +08:00
|
|
|
/// Create the PowerPC pass configuration for pass manager \p PM.
/// The returned object is heap-allocated with `new`; this function does not
/// free it.
TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
  TargetPassConfig *Config = new PPCPassConfig(this, PM);
  return Config;
}
|
|
|
|
|
2014-09-24 04:46:49 +08:00
|
|
|
void PPCPassConfig::addIRPasses() {
|
2015-12-08 04:50:29 +08:00
|
|
|
if (TM->getOptLevel() != CodeGenOpt::None)
|
|
|
|
addPass(createPPCBoolRetToIntPass());
|
2014-09-24 04:46:49 +08:00
|
|
|
addPass(createAtomicExpandPass(&getPPCTargetMachine()));
|
2014-11-21 12:35:51 +08:00
|
|
|
|
[PowerPC] Loop Data Prefetching for the BG/Q
The IBM BG/Q supercomputer's A2 cores have a hardware prefetching unit, the
L1P, but it does not prefetch directly into the A2's L1 cache. Instead, it
prefetches into its own L1P buffer, and the latency to access that buffer is
significantly higher than that to the L1 cache (although smaller than the
latency to the L2 cache). As a result, especially when multiple hardware
threads are not actively busy, explicitly prefetching data into the L1 cache is
advantageous.
I've been using this pass out-of-tree for data prefetching on the BG/Q for well
over a year, and it has worked quite well. It is enabled by default only for
the BG/Q, but can be enabled for other cores as well via a command-line option.
Eventually, we might want to add some TTI interfaces and move this into
Transforms/Scalar (there is nothing particularly target dependent about it,
although only machines like the BG/Q will benefit from its simplistic
strategy).
llvm-svn: 229966
2015-02-20 13:08:21 +08:00
|
|
|
// For the BG/Q (or if explicitly requested), add explicit data prefetch
|
|
|
|
// intrinsics.
|
2015-06-16 23:44:21 +08:00
|
|
|
bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
|
|
|
|
getOptLevel() != CodeGenOpt::None;
|
[PowerPC] Loop Data Prefetching for the BG/Q
The IBM BG/Q supercomputer's A2 cores have a hardware prefetching unit, the
L1P, but it does not prefetch directly into the A2's L1 cache. Instead, it
prefetches into its own L1P buffer, and the latency to access that buffer is
significantly higher than that to the L1 cache (although smaller than the
latency to the L2 cache). As a result, especially when multiple hardware
threads are not actively busy, explicitly prefetching data into the L1 cache is
advantageous.
I've been using this pass out-of-tree for data prefetching on the BG/Q for well
over a year, and it has worked quite well. It is enabled by default only for
the BG/Q, but can be enabled for other cores as well via a command-line option.
Eventually, we might want to add some TTI interfaces and move this into
Transforms/Scalar (there is nothing particularly target dependent about it,
although only machines like the BG/Q will benefit from its simplistic
strategy).
llvm-svn: 229966
2015-02-20 13:08:21 +08:00
|
|
|
if (EnablePrefetch.getNumOccurrences() > 0)
|
|
|
|
UsePrefetching = EnablePrefetch;
|
|
|
|
if (UsePrefetching)
|
2016-02-19 05:38:19 +08:00
|
|
|
addPass(createLoopDataPrefetchPass());
|
[PowerPC] Loop Data Prefetching for the BG/Q
The IBM BG/Q supercomputer's A2 cores have a hardware prefetching unit, the
L1P, but it does not prefetch directly into the A2's L1 cache. Instead, it
prefetches into its own L1P buffer, and the latency to access that buffer is
significantly higher than that to the L1 cache (although smaller than the
latency to the L2 cache). As a result, especially when multiple hardware
threads are not actively busy, explicitly prefetching data into the L1 cache is
advantageous.
I've been using this pass out-of-tree for data prefetching on the BG/Q for well
over a year, and it has worked quite well. It is enabled by default only for
the BG/Q, but can be enabled for other cores as well via a command-line option.
Eventually, we might want to add some TTI interfaces and move this into
Transforms/Scalar (there is nothing particularly target dependent about it,
although only machines like the BG/Q will benefit from its simplistic
strategy).
llvm-svn: 229966
2015-02-20 13:08:21 +08:00
|
|
|
|
2016-04-07 23:30:55 +08:00
|
|
|
if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
|
2014-11-21 12:35:51 +08:00
|
|
|
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
|
|
|
|
// and lower a GEP with multiple indices to either arithmetic operations or
|
|
|
|
// multiple GEPs with single index.
|
|
|
|
addPass(createSeparateConstOffsetFromGEPPass(TM, true));
|
|
|
|
// Call EarlyCSE pass to find and remove subexpressions in the lowered
|
|
|
|
// result.
|
|
|
|
addPass(createEarlyCSEPass());
|
|
|
|
// Do loop invariant code motion in case part of the lowered result is
|
|
|
|
// invariant.
|
|
|
|
addPass(createLICMPass());
|
|
|
|
}
|
|
|
|
|
2014-09-24 04:46:49 +08:00
|
|
|
TargetPassConfig::addIRPasses();
|
|
|
|
}
|
|
|
|
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
bool PPCPassConfig::addPreISel() {
|
[PowerPC] Prepare loops for pre-increment loads/stores
PowerPC supports pre-increment load/store instructions (except for Altivec/VSX
vector load/stores). Using these on embedded cores can be very important, but
most loops are not naturally set up to use them. We can often change that,
however, by placing loops into a non-canonical form. Generically, this means
transforming loops like this:
for (int i = 0; i < n; ++i)
array[i] = c;
to look like this:
T *p = array[-1];
for (int i = 0; i < n; ++i)
*++p = c;
the key point is that addresses accessed are pulled into dedicated PHIs and
"pre-decremented" in the loop preheader. This allows the use of pre-increment
load/store instructions without loop peeling.
A target-specific late IR-level pass (running post-LSR), PPCLoopPreIncPrep, is
introduced to perform this transformation. I've used this code out-of-tree for
generating code for the PPC A2 for over a year. Somewhat to my surprise,
running the test suite + externals on a P7 with this transformation enabled
showed no performance regressions, and one speedup:
External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk
-2.32514% +/- 1.03736%
So I'm going to enable it on everything for now. I was surprised by this
because, on the POWER cores, these pre-increment load/store instructions are
cracked (and, thus, harder to schedule effectively). But seeing no regressions,
and feeling that it is generally easier to split instructions apart late than
it is to combine them late, this might be the better approach regardless.
In the future, we might want to integrate this functionality into LSR (but
currently LSR does not create new PHI nodes, so (for that and other reasons)
significant work would need to be done).
llvm-svn: 228328
2015-02-06 02:43:00 +08:00
|
|
|
if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)
|
|
|
|
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
|
|
|
|
|
2012-06-09 03:19:53 +08:00
|
|
|
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
addPass(createPPCCTRLoops(getPPCTargetMachine()));
|
2012-06-08 23:38:21 +08:00
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-04-06 07:29:01 +08:00
|
|
|
bool PPCPassConfig::addILPOpts() {
|
2014-05-22 07:40:26 +08:00
|
|
|
addPass(&EarlyIfConverterID);
|
2015-07-15 16:23:05 +08:00
|
|
|
|
|
|
|
if (EnableMachineCombinerPass)
|
|
|
|
addPass(&MachineCombinerID);
|
|
|
|
|
2014-05-22 07:40:26 +08:00
|
|
|
return true;
|
2013-04-06 07:29:01 +08:00
|
|
|
}
|
|
|
|
|
2012-02-03 13:12:41 +08:00
|
|
|
bool PPCPassConfig::addInstSelector() {
|
2005-08-18 03:33:30 +08:00
|
|
|
// Install an instruction selector.
|
2012-07-03 03:48:31 +08:00
|
|
|
addPass(createPPCISelDag(getPPCTargetMachine()));
|
Add a PPCCTRLoops verification pass
When asserts are enabled, this adds a verification pass for PPC counter-loop
formation. Unfortunately, without sacrificing code quality, there is no better
way of forming counter-based loops except at the (late) IR level. This means
that we need to recognize, at the IR level, anything which might turn into a
function call (or indirect branch). Because this is currently a finite set of
things, and because SelectionDAG lowering is basic-block local, this can be
done. Nevertheless, it is fragile, and failure results in a miscompile. This
verification pass checks that all (reachable) counter-based branches are
dominated by a loop mtctr instruction, and that no instructions in between
clobber the counter register. If these conditions are not satisfied, then an
ICE will be triggered.
In short, this is to help us sleep better at night.
llvm-svn: 182295
2013-05-21 00:08:17 +08:00
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
|
|
|
|
addPass(createPPCCTRLoopsVerify());
|
|
|
|
#endif
|
|
|
|
|
2014-05-22 09:21:35 +08:00
|
|
|
addPass(createPPCVSXCopyPass());
|
2006-09-04 12:14:57 +08:00
|
|
|
return false;
|
|
|
|
}
|
2004-08-11 15:40:04 +08:00
|
|
|
|
[PPC64LE] Remove unnecessary swaps from lane-insensitive vector computations
This patch adds a new SSA MI pass that runs on little-endian PPC64
code with VSX enabled. Loads and stores of 4x32 and 2x64 vectors
without alignment constraints are accomplished for little-endian using
lxvd2x/xxswapd and xxswapd/stxvd2x. The existence of the additional
xxswapd instructions hurts performance in comparison with big-endian
code, but they are necessary in the general case to support correct
semantics.
However, the general case does not apply to most vector code. Many
vector instructions are lane-insensitive; they do not "care" which
lanes the parallel computations are performed within, provided that
the resulting data is stored into the correct locations. Thus this
pass looks for computations that perform only lane-insensitive
operations, and removes the unnecessary swaps from loads and stores in
such computations.
Future improvements will allow computations using certain
lane-sensitive operations to also be optimized in this manner, by
modifying the lane-sensitive operations to account for the permuted
order of the lanes. However, this patch only adds the infrastructure
to permit this; no lane-sensitive operations are optimized at this
time.
This code is heavily exercised by the various vectorizing applications
in the projects/test-suite tree. For the time being, I have only added
one simple test case to demonstrate what the pass is doing. Although
it is quite simple, it provides coverage for much of the code,
including the special case handling of copies and subreg-to-reg
operations feeding the swaps. I plan to add additional tests in the
future as I fill in more of the "special handling" code.
Two existing tests were affected, because they expected the swaps to
be present, but they are now removed.
llvm-svn: 235910
2015-04-28 03:57:34 +08:00
|
|
|
void PPCPassConfig::addMachineSSAOptimization() {
|
|
|
|
TargetPassConfig::addMachineSSAOptimization();
|
|
|
|
// For little endian, remove where possible the vector swap instructions
|
|
|
|
// introduced at code generation to normalize vector element order.
|
2015-06-16 23:44:21 +08:00
|
|
|
if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
|
[PPC64LE] Remove unnecessary swaps from lane-insensitive vector computations
This patch adds a new SSA MI pass that runs on little-endian PPC64
code with VSX enabled. Loads and stores of 4x32 and 2x64 vectors
without alignment constraints are accomplished for little-endian using
lxvd2x/xxswapd and xxswapd/stxvd2x. The existence of the additional
xxswapd instructions hurts performance in comparison with big-endian
code, but they are necessary in the general case to support correct
semantics.
However, the general case does not apply to most vector code. Many
vector instructions are lane-insensitive; they do not "care" which
lanes the parallel computations are performed within, provided that
the resulting data is stored into the correct locations. Thus this
pass looks for computations that perform only lane-insensitive
operations, and remove the unnecessary swaps from loads and stores in
such computations.
Future improvements will allow computations using certain
lane-sensitive operations to also be optimized in this manner, by
modifying the lane-sensitive operations to account for the permuted
order of the lanes. However, this patch only adds the infrastructure
to permit this; no lane-sensitive operations are optimized at this
time.
This code is heavily exercised by the various vectorizing applications
in the projects/test-suite tree. For the time being, I have only added
one simple test case to demonstrate what the pass is doing. Although
it is quite simple, it provides coverage for much of the code,
including the special case handling of copies and subreg-to-reg
operations feeding the swaps. I plan to add additional tests in the
future as I fill in more of the "special handling" code.
Two existing tests were affected, because they expected the swaps to
be present, but they are now removed.
llvm-svn: 235910
2015-04-28 03:57:34 +08:00
|
|
|
!DisableVSXSwapRemoval)
|
|
|
|
addPass(createPPCVSXSwapRemovalPass());
|
2015-11-11 05:38:26 +08:00
|
|
|
// Target-specific peephole cleanups performed after instruction
|
|
|
|
// selection.
|
|
|
|
if (!DisableMIPeephole) {
|
|
|
|
addPass(createPPCMIPeepholePass());
|
|
|
|
addPass(&DeadMachineInstructionElimID);
|
|
|
|
}
|
[PPC64LE] Remove unnecessary swaps from lane-insensitive vector computations
This patch adds a new SSA MI pass that runs on little-endian PPC64
code with VSX enabled. Loads and stores of 4x32 and 2x64 vectors
without alignment constraints are accomplished for little-endian using
lxvd2x/xxswapd and xxswapd/stxvd2x. The existence of the additional
xxswapd instructions hurts performance in comparison with big-endian
code, but they are necessary in the general case to support correct
semantics.
However, the general case does not apply to most vector code. Many
vector instructions are lane-insensitive; they do not "care" which
lanes the parallel computations are performed within, provided that
the resulting data is stored into the correct locations. Thus this
pass looks for computations that perform only lane-insensitive
operations, and remove the unnecessary swaps from loads and stores in
such computations.
Future improvements will allow computations using certain
lane-sensitive operations to also be optimized in this manner, by
modifying the lane-sensitive operations to account for the permuted
order of the lanes. However, this patch only adds the infrastructure
to permit this; no lane-sensitive operations are optimized at this
time.
This code is heavily exercised by the various vectorizing applications
in the projects/test-suite tree. For the time being, I have only added
one simple test case to demonstrate what the pass is doing. Although
it is quite simple, it provides coverage for much of the code,
including the special case handling of copies and subreg-to-reg
operations feeding the swaps. I plan to add additional tests in the
future as I fill in more of the "special handling" code.
Two existing tests were affected, because they expected the swaps to
be present, but they are now removed.
llvm-svn: 235910
2015-04-28 03:57:34 +08:00
|
|
|
}
|
|
|
|
|
2014-12-12 05:26:47 +08:00
|
|
|
void PPCPassConfig::addPreRegAlloc() {
|
2016-04-28 03:39:32 +08:00
|
|
|
if (getOptLevel() != CodeGenOpt::None) {
|
|
|
|
initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
|
|
|
|
insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
|
|
|
|
&PPCVSXFMAMutateID);
|
|
|
|
}
|
2016-04-29 07:42:51 +08:00
|
|
|
if (getPPCTargetMachine().getRelocationModel() == Reloc::PIC_) {
|
|
|
|
// FIXME: LiveVariables should not be necessary here!
|
|
|
|
// PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on
|
|
|
|
// LiveVariables. This (unnecessary) dependency has been removed now,
|
|
|
|
// however a stage-2 clang build fails without LiveVariables computed here.
|
|
|
|
addPass(&LiveVariablesID, false);
|
2015-02-11 03:09:05 +08:00
|
|
|
addPass(createPPCTLSDynamicCallPass());
|
2016-04-29 07:42:51 +08:00
|
|
|
}
|
[PowerPC] Add extra r2 read deps on @toc@l relocations
If some commits are happy, and some commits are sad, this is a sad commit. It
is sad because it restricts instruction scheduling to work around a binutils
linker bug, and moreover, one that may never be fixed. On 2012-05-21, GCC was
updated not to produce code triggering this bug, and now we'll do the same...
When resolving an address using the ELF ABI TOC pointer, two relocations are
generally required: one for the high part and one for the low part. Only
the high part generally explicitly depends on r2 (the TOC pointer). And, so,
we might produce code like this:
.Ltmp526:
addis 3, 2, .LC12@toc@ha
.Ltmp1628:
std 2, 40(1)
ld 5, 0(27)
ld 2, 8(27)
ld 11, 16(27)
ld 3, .LC12@toc@l(3)
rldicl 4, 4, 0, 32
mtctr 5
bctrl
ld 2, 40(1)
And there is nothing wrong with this code, as such, but there is a linker bug
in binutils (https://sourceware.org/bugzilla/show_bug.cgi?id=18414) that will
misoptimize this code sequence to this:
nop
std r2,40(r1)
ld r5,0(r27)
ld r2,8(r27)
ld r11,16(r27)
ld r3,-32472(r2)
clrldi r4,r4,32
mtctr r5
bctrl
ld r2,40(r1)
because the linker does not know (and does not check) that the value in r2
changed in between the instruction using the .LC12@toc@ha (TOC-relative)
relocation and the instruction using the .LC12@toc@l(3) relocation.
Because it finds these instructions using the relocations (and not by
scanning the instructions), it has been asserted that there is no good way
to detect the change of r2 in between. As a result, this bug may never be
fixed (i.e. it may become part of the definition of the ABI). GCC was
updated to add extra dependencies on r2 to instructions using the @toc@l
relocations to avoid this problem, and we'll do the same here.
This is done as a separate pass because:
1. These extra r2 dependencies are not really properties of the
instructions, but rather due to a linker bug, and maybe one day we'll be
able to get rid of them when targeting linkers without this bug (and,
thus, keeping the logic centralized here will make that
straightforward).
2. There are ISel-level peephole optimizations that propagate the @toc@l
relocations to some user instructions, and so the exta dependencies do
not apply only to a fixed set of instructions (without undesirable
definition replication).
The test case was reduced with the help of bugpoint, with minimal cleaning. I'm
looking forward to our upcoming MI serialization support, and with that, much
better tests can be created.
llvm-svn: 237556
2015-05-18 14:25:59 +08:00
|
|
|
if (EnableExtraTOCRegDeps)
|
|
|
|
addPass(createPPCTOCRegDepsPass());
|
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA
The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:
1. This will eliminate an otherwise-necessary copy of the addend
2. One of the product operands is killed by the instruction
The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.
As a simple example, if we have:
%vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
...
%vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
%RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
...
We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:
%vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
%RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
is replaced by:
%vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
%RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
llvm-svn: 204768
2014-03-26 07:29:21 +08:00
|
|
|
}
|
|
|
|
|
2014-12-12 05:26:47 +08:00
|
|
|
void PPCPassConfig::addPreSched2() {
|
2016-04-01 04:39:41 +08:00
|
|
|
if (getOptLevel() != CodeGenOpt::None) {
|
2013-04-10 06:58:37 +08:00
|
|
|
addPass(&IfConverterID);
|
2016-04-01 04:39:41 +08:00
|
|
|
|
|
|
|
// This optimization must happen after anything that might do store-to-load
|
|
|
|
// forwarding. Here we're after RA (and, thus, when spills are inserted)
|
|
|
|
// but before post-RA scheduling.
|
|
|
|
if (!DisableQPXLoadSplat)
|
|
|
|
addPass(createPPCQPXLoadSplatPass());
|
|
|
|
}
|
2013-04-10 06:58:37 +08:00
|
|
|
}
|
|
|
|
|
2014-12-12 05:26:47 +08:00
|
|
|
void PPCPassConfig::addPreEmitPass() {
|
2013-04-09 00:24:03 +08:00
|
|
|
if (getOptLevel() != CodeGenOpt::None)
|
2014-12-12 05:26:47 +08:00
|
|
|
addPass(createPPCEarlyReturnPass(), false);
|
2006-09-04 12:14:57 +08:00
|
|
|
// Must run branch selection immediately preceding the asm printer.
|
2014-12-12 05:26:47 +08:00
|
|
|
addPass(createPPCBranchSelectionPass(), false);
|
2004-08-11 15:40:04 +08:00
|
|
|
}
|
|
|
|
|
2015-02-01 21:20:00 +08:00
|
|
|
// Returns a TargetIRAnalysis whose callback builds, for each function, a
// TargetTransformInfo backed by the PowerPC TTI implementation. The callback
// captures `this`.
TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
  auto MakeTTI = [this](const Function &F) {
    return TargetTransformInfo(PPCTTIImpl(this, F));
  };
  return TargetIRAnalysis(MakeTTI);
}
|