//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the PowerPC target.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetMachine.h"
#include "PPC.h"
#include "PPCTargetObjectFile.h"
#include "PPCTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static cl::opt<bool>
DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                cl::desc("Disable CTR loops for PPC"));

static cl::opt<bool>
VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early", cl::Hidden,
                  cl::desc("Schedule VSX FMA instruction mutation early"));

static cl::opt<bool>
EnableGEPOpt("ppc-gep-opt", cl::Hidden,
             cl::desc("Enable optimizations on complex GEPs"),
             cl::init(true));

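// How these flags are typically exercised (a sketch; the triples and input
// file are illustrative, not taken from this file):
//   llc -mtriple=powerpc64-unknown-linux-gnu -disable-ppc-ctrloops foo.ll
//   llc -mtriple=powerpc64le-unknown-linux-gnu -O3 -ppc-gep-opt=false foo.ll
//   llc -mtriple=powerpc64le-unknown-linux-gnu \
//       -schedule-ppc-vsx-fma-mutation-early foo.ll
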
extern "C" void LLVMInitializePowerPCTarget() {
  // Register the targets.
  RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
  RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
  RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
}

/// Return the datalayout string of a subtarget.
static std::string getDataLayoutString(const Triple &T) {
  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
  std::string Ret;

  // Most PPC* platforms are big endian; PPC64LE is little endian.
  if (T.getArch() == Triple::ppc64le)
    Ret = "e";
  else
    Ret = "E";

  Ret += DataLayout::getManglingComponent(T);

  // PPC32 has 32-bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32-bit
  // pointers.
  if (!is64Bit || T.getOS() == Triple::Lv2)
    Ret += "-p:32:32";

  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
  // documentation are wrong; these are correct (i.e. "what gcc does").
  if (is64Bit || !T.isOSDarwin())
    Ret += "-i64:64";
  else
    Ret += "-f64:32:64";

  // PPC64 has 32- and 64-bit registers; PPC32 has only 32-bit ones.
  if (is64Bit)
    Ret += "-n32:64";
  else
    Ret += "-n32";

  return Ret;
}

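// For illustration, two strings the logic above produces (traced by hand;
// this assumes DataLayout::getManglingComponent yields "-m:e" for ELF and
// "-m:o" for Mach-O triples):
//   ppc64le-unknown-linux-gnu: "e-m:e-i64:64-n32:64"
//   powerpc-apple-darwin:      "E-m:o-p:32:32-f64:32:64-n32"
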
/// Compute additions to the feature string implied by the triple and the
/// optimization level.
static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
                                      StringRef TT) {
  std::string FullFS = FS;
  Triple TargetTriple(TT);

  // Make sure 64-bit features are available when the CPU name is generic.
  if (TargetTriple.getArch() == Triple::ppc64 ||
      TargetTriple.getArch() == Triple::ppc64le) {
    if (!FullFS.empty())
      FullFS = "+64bit," + FullFS;
    else
      FullFS = "+64bit";
  }

  // At the default optimization level and above, make use of the CR bits.
  if (OL >= CodeGenOpt::Default) {
    if (!FullFS.empty())
      FullFS = "+crbits," + FullFS;
    else
      FullFS = "+crbits";
  }

  // Function descriptors produced by the linker are invariant, so when
  // optimizing, mark the associated loads as invariant (which allows them to
  // be hoisted out of loops).
  if (OL != CodeGenOpt::None) {
    if (!FullFS.empty())
      FullFS = "+invariant-function-descriptors," + FullFS;
    else
      FullFS = "+invariant-function-descriptors";
  }

  return FullFS;
}

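// For illustration, tracing the logic above: with an empty incoming FS, a
// ppc64le triple compiled at -O2 yields
//   "+invariant-function-descriptors,+crbits,+64bit"
// because each addition is prepended to the accumulated string.
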
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  // If it isn't a Mach-O file then it's going to be a Linux ELF object file.
  if (TT.isOSDarwin())
    return make_unique<TargetLoweringObjectFileMachO>();

  return make_unique<PPC64LinuxTargetObjectFile>();
}

// The FeatureString here is a little subtle. We are modifying the feature
// string with what are (currently) non-function-specific overrides as it goes
// into the LLVMTargetMachine constructor, and then using the stored value in
// the Subtarget constructor below it.
PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                   StringRef FS, const TargetOptions &Options,
                                   Reloc::Model RM, CodeModel::Model CM,
                                   CodeGenOpt::Level OL)
    : LLVMTargetMachine(T, TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM,
                        CM, OL),
      TLOF(createTLOF(Triple(getTargetTriple()))),
      DL(getDataLayoutString(Triple(TT))), Subtarget(TT, CPU, TargetFS, *this) {
  initAsmInfo();
}

PPCTargetMachine::~PPCTargetMachine() {}

void PPC32TargetMachine::anchor() { }

PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Reloc::Model RM, CodeModel::Model CM,
                                       CodeGenOpt::Level OL)
    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
}

void PPC64TargetMachine::anchor() { }

PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Reloc::Model RM, CodeModel::Model CM,
                                       CodeGenOpt::Level OL)
    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
}

const PPCSubtarget *
PPCTargetMachine::getSubtargetImpl(const Function &F) const {
  AttributeSet FnAttrs = F.getAttributes();
  Attribute CPUAttr =
      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
  Attribute FSAttr =
      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");

  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                        ? CPUAttr.getValueAsString().str()
                        : TargetCPU;
  std::string FS = !FSAttr.hasAttribute(Attribute::None)
                       ? FSAttr.getValueAsString().str()
                       : TargetFS;

  auto &I = SubtargetMap[CPU + FS];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<PPCSubtarget>(TargetTriple, CPU, FS, *this);
  }
  return I.get();
}

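// A sketch of the IR this keys off (using the standard "target-cpu" and
// "target-features" function attributes; the values are illustrative):
//   define void @f() #0 { ret void }
//   attributes #0 = { "target-cpu"="pwr8" "target-features"="+vsx" }
// Such a function gets its own PPCSubtarget, cached under the concatenated
// key "pwr8+vsx", independent of the module-level CPU and feature defaults.
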
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//

namespace {
/// PPC Code Generator Pass Configuration Options.
class PPCPassConfig : public TargetPassConfig {
public:
  PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  PPCTargetMachine &getPPCTargetMachine() const {
    return getTM<PPCTargetMachine>();
  }

  void addIRPasses() override;
  bool addPreISel() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
} // namespace

TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new PPCPassConfig(this, PM);
}

void PPCPassConfig::addIRPasses() {
  addPass(createAtomicExpandPass(&getPPCTargetMachine()));

  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
    // Call the SeparateConstOffsetFromGEP pass to extract constants within
    // indices and lower a GEP with multiple indices to either arithmetic
    // operations or multiple GEPs with a single index.
    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
    // Call the EarlyCSE pass to find and remove subexpressions in the lowered
    // result.
    addPass(createEarlyCSEPass());
    // Do loop invariant code motion in case part of the lowered result is
    // invariant.
    addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();
}

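// A sketch of what SeparateConstOffsetFromGEP does to a multi-index GEP
// (illustrative IR, not taken from this file): in
//   %p = getelementptr [32 x [32 x i32]], [32 x [32 x i32]]* %a,
//                      i64 0, i64 %i, i64 5
// the constant offset (5 * 4 bytes) is peeled off the variable part, so that
// EarlyCSE can reuse the variable computation across GEPs differing only in
// the constant, and LICM can hoist it out of loops.
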
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
bool PPCPassConfig::addPreISel() {
  // The CTR-loop formation pass runs late, at the IR level; it uses
  // ScalarEvolution to compute backedge-taken counts and inserts intrinsics
  // that are later lowered to decrement-and-branch instructions.
  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
    addPass(createPPCCTRLoops(getPPCTargetMachine()));

  return false;
}

bool PPCPassConfig::addILPOpts() {
  addPass(&EarlyIfConverterID);
  return true;
}

bool PPCPassConfig::addInstSelector() {
  // Install an instruction selector.
  addPass(createPPCISelDag(getPPCTargetMachine()));

#ifndef NDEBUG
  // In asserts builds, verify that every (reachable) counter-based branch is
  // dominated by a loop mtctr instruction and that nothing in between
  // clobbers the counter register; a violation triggers an ICE instead of a
  // silent miscompile.
  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
    addPass(createPPCCTRLoopsVerify());
#endif

  addPass(createPPCVSXCopyPass());
  return false;
}

void PPCPassConfig::addPreRegAlloc() {
  // Mutate VSX A-type FMA instructions into M-type instructions when that
  // eliminates an otherwise-necessary copy of the addend. The pass runs
  // between MI scheduling and register allocation (or just after the register
  // coalescer, when mutation is scheduled early), since only there do we know
  // whether a product operand is killed by the instruction.
  initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
  insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
             &PPCVSXFMAMutateID);
  // Wrap the machine-specific nodes that hide __tls_get_addr calls with
  // copies to and from GPR3 just before register allocation, so MachineCSE
  // can common the calls and the coalescer can then remove the copies.
  addPass(createPPCTLSDynamicCallPass());
}

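// A sketch of the A-type to M-type FMA mutation (adapted from the pass's
// motivating example): given
//   %vreg5<def>       = COPY %vreg9
//   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16
// where the addend %vreg9 had to be copied into the output register, the pass
// instead writes the register of a killed product operand via the M-type
// form,
//   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9
// and the COPY is deleted.
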
void PPCPassConfig::addPreSched2() {
  if (getOptLevel() != CodeGenOpt::None)
    addPass(&IfConverterID);
}

void PPCPassConfig::addPreEmitPass() {
  if (getOptLevel() != CodeGenOpt::None)
    addPass(createPPCEarlyReturnPass(), false);
  // Must run branch selection immediately preceding the asm printer.
  addPass(createPPCBranchSelectionPass(), false);
}

TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis(
      [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); });
}
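
// Downstream, IR passes obtain PPC cost information through this hook: a
// fresh PPCTTIImpl is built per function by the lambda above, and clients
// such as the vectorizer query it through the TargetTransformInfo wrapper.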