2005-10-16 13:39:50 +08:00
|
|
|
//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
|
2005-08-18 03:33:03 +08:00
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-08-18 03:33:03 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2005-10-16 13:39:50 +08:00
|
|
|
// This file defines a pattern matching instruction selector for PowerPC,
|
2005-08-18 03:33:03 +08:00
|
|
|
// converting from a legalized dag to a PPC dag.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2006-12-20 06:59:26 +08:00
|
|
|
#define DEBUG_TYPE "ppc-codegen"
|
2005-10-15 07:51:18 +08:00
|
|
|
#include "PPC.h"
|
2011-07-26 08:24:13 +08:00
|
|
|
#include "MCTargetDesc/PPCPredicates.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "PPCTargetMachine.h"
|
2005-08-20 06:38:53 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2007-12-31 12:13:23 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2005-08-18 03:33:03 +08:00
|
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
|
|
|
#include "llvm/CodeGen/SelectionDAGISel.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/Function.h"
|
2013-01-19 16:03:47 +08:00
|
|
|
#include "llvm/IR/GlobalAlias.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/GlobalValue.h"
|
|
|
|
#include "llvm/IR/GlobalVariable.h"
|
|
|
|
#include "llvm/IR/Intrinsics.h"
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2005-08-18 03:33:03 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Target/TargetOptions.h"
|
2005-08-18 03:33:03 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// FIXME: Remove this once the bug has been fixed!
// Hidden boolean command-line option registered under the name
// "expose-ppc-andi-glue-bug". The code that reads this flag is not in this
// part of the file.
|
|
|
|
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
|
|
|
|
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
|
|
|
|
|
2013-02-14 01:40:07 +08:00
|
|
|
// Forward declaration of the pass-initialization hook. It is invoked from the
// PPCDAGToDAGISel constructor with the global PassRegistry; its definition is
// not in this part of the file.
namespace llvm {
|
|
|
|
void initializePPCDAGToDAGISelPass(PassRegistry&);
|
|
|
|
}
|
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
namespace {
|
|
|
|
//===--------------------------------------------------------------------===//
|
2005-10-18 08:28:58 +08:00
|
|
|
/// PPCDAGToDAGISel - PPC specific code to select PPC machine
|
2005-08-18 03:33:03 +08:00
|
|
|
/// instructions for SelectionDAG operations.
|
|
|
|
///
|
2009-10-25 14:33:48 +08:00
|
|
|
class PPCDAGToDAGISel : public SelectionDAGISel {
|
2010-04-17 23:26:15 +08:00
|
|
|
const PPCTargetMachine &TM;
|
|
|
|
const PPCTargetLowering &PPCLowering;
|
2007-10-23 14:42:42 +08:00
|
|
|
const PPCSubtarget &PPCSubTarget;
|
2005-08-20 06:38:53 +08:00
|
|
|
unsigned GlobalBaseReg;
|
2005-08-18 03:33:03 +08:00
|
|
|
public:
|
2008-07-08 02:00:37 +08:00
|
|
|
explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
|
2009-01-16 03:20:50 +08:00
|
|
|
: SelectionDAGISel(tm), TM(tm),
|
2007-10-23 14:42:42 +08:00
|
|
|
PPCLowering(*TM.getTargetLowering()),
|
2013-02-14 01:40:07 +08:00
|
|
|
PPCSubTarget(*TM.getSubtargetImpl()) {
|
|
|
|
initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2009-08-01 02:16:33 +08:00
|
|
|
virtual bool runOnMachineFunction(MachineFunction &MF) {
|
2005-08-20 06:38:53 +08:00
|
|
|
// Make sure we re-emit a set of the global base reg if necessary
|
|
|
|
GlobalBaseReg = 0;
|
2009-08-01 02:16:33 +08:00
|
|
|
SelectionDAGISel::runOnMachineFunction(MF);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2012-10-11 04:54:15 +08:00
|
|
|
if (!PPCSubTarget.isSVR4ABI())
|
|
|
|
InsertVRSaveCode(MF);
|
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
return true;
|
2005-08-20 06:38:53 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
virtual void PostprocessISelDAG();
|
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
/// getI32Imm - Return a target constant with the specified value, of type
|
|
|
|
/// i32.
|
2008-07-28 05:46:04 +08:00
|
|
|
inline SDValue getI32Imm(unsigned Imm) {
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->getTargetConstant(Imm, MVT::i32);
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
2005-08-20 06:38:53 +08:00
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
/// getI64Imm - Return a target constant with the specified value, of type
|
|
|
|
/// i64.
|
2008-07-28 05:46:04 +08:00
|
|
|
inline SDValue getI64Imm(uint64_t Imm) {
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->getTargetConstant(Imm, MVT::i64);
|
2006-06-27 08:04:13 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
/// getSmallIPtrImm - Return a target constant of pointer type.
|
2008-07-28 05:46:04 +08:00
|
|
|
inline SDValue getSmallIPtrImm(unsigned Imm) {
|
2006-06-27 08:04:13 +08:00
|
|
|
return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2012-09-27 18:14:43 +08:00
|
|
|
/// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
|
2006-09-22 13:01:56 +08:00
|
|
|
/// with any number of 0s on either side. The 1s are allowed to wrap from
|
|
|
|
/// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
|
|
|
|
/// 0x0F0F0000 is not, since all 1s are not contiguous.
|
|
|
|
static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
|
|
|
|
|
|
|
|
|
|
|
|
/// isRotateAndMask - Returns true if Mask and Shift can be folded into a
|
|
|
|
/// rotate and mask opcode and mask operation.
|
2009-11-24 09:09:07 +08:00
|
|
|
static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
|
2006-09-22 13:01:56 +08:00
|
|
|
unsigned &SH, unsigned &MB, unsigned &ME);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-20 06:38:53 +08:00
|
|
|
/// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
|
|
|
|
/// base register. Return the virtual register that holds this value.
|
2006-08-26 13:34:46 +08:00
|
|
|
SDNode *getGlobalBaseReg();
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
// Select - Convert the specified operand from a target-independent to a
|
|
|
|
// target-specific node if it hasn't already been changed.
|
2010-01-05 09:24:18 +08:00
|
|
|
SDNode *Select(SDNode *N);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-19 08:38:14 +08:00
|
|
|
SDNode *SelectBitfieldInsert(SDNode *N);
|
|
|
|
|
2005-08-22 02:50:37 +08:00
|
|
|
/// SelectCC - Select a comparison of the specified values with the
|
|
|
|
/// specified condition code, returning the CR# of the expression.
|
2013-05-25 10:42:55 +08:00
|
|
|
SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
|
2005-08-22 02:50:37 +08:00
|
|
|
|
2005-12-20 07:25:09 +08:00
|
|
|
/// SelectAddrImm - Returns true if the address N can be represented by
|
|
|
|
/// a base register plus a signed 16-bit displacement [r+imm].
|
2010-09-22 04:31:19 +08:00
|
|
|
bool SelectAddrImm(SDValue N, SDValue &Disp,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Base) {
|
2013-05-17 01:58:02 +08:00
|
|
|
return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
|
2006-11-08 10:15:41 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-16 08:41:37 +08:00
|
|
|
/// SelectAddrImmOffs - Return true if the operand is valid for a preinc
|
2013-03-22 22:58:17 +08:00
|
|
|
/// immediate field. Note that the operand at this point is already the
|
|
|
|
/// result of a prior SelectAddressRegImm call.
|
2010-09-22 04:31:19 +08:00
|
|
|
bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
|
2013-03-22 22:58:17 +08:00
|
|
|
if (N.getOpcode() == ISD::TargetConstant ||
|
2012-06-22 04:10:48 +08:00
|
|
|
N.getOpcode() == ISD::TargetGlobalAddress) {
|
2012-06-19 10:34:32 +08:00
|
|
|
Out = N;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-12-20 07:25:09 +08:00
|
|
|
/// SelectAddrIdx - Given the specified addressed, check to see if it can be
|
|
|
|
/// represented as an indexed [r+r] operation. Returns false if it can
|
|
|
|
/// be represented by [r+imm], which are preferred.
|
2010-09-22 04:31:19 +08:00
|
|
|
bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
|
2006-11-08 10:15:41 +08:00
|
|
|
return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-12-20 07:25:09 +08:00
|
|
|
/// SelectAddrIdxOnly - Given the specified addressed, force it to be
|
|
|
|
/// represented as an indexed [r+r] operation.
|
2010-09-22 04:31:19 +08:00
|
|
|
bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
|
2006-11-08 10:15:41 +08:00
|
|
|
return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
|
|
|
|
}
|
2005-08-22 06:31:09 +08:00
|
|
|
|
2013-05-17 01:58:02 +08:00
|
|
|
/// SelectAddrImmX4 - Returns true if the address N can be represented by
|
|
|
|
/// a base register plus a signed 16-bit displacement that is a multiple of 4.
|
|
|
|
/// Suitable for use by STD and friends.
|
|
|
|
bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
|
|
|
|
return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
|
2006-11-08 10:15:41 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2013-03-22 05:37:52 +08:00
|
|
|
// Select an address into a single register.
|
|
|
|
bool SelectAddr(SDValue N, SDValue &Base) {
|
|
|
|
Base = N;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2006-02-24 10:13:12 +08:00
|
|
|
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
|
2009-08-18 08:18:39 +08:00
|
|
|
/// inline asm expressions. It is always correct to compute the value into
|
|
|
|
/// a register. The case of adding a (possibly relocatable) constant to a
|
|
|
|
/// register can be improved, but it is wrong to substitute Reg+Reg for
|
|
|
|
/// Reg in an asm, because the load or store opcode would have to change.
|
|
|
|
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
|
2006-02-24 10:13:12 +08:00
|
|
|
char ConstraintCode,
|
2008-08-23 10:25:05 +08:00
|
|
|
std::vector<SDValue> &OutOps) {
|
2009-08-18 08:18:39 +08:00
|
|
|
OutOps.push_back(Op);
|
2006-02-24 10:13:12 +08:00
|
|
|
return false;
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2009-08-01 02:16:33 +08:00
|
|
|
void InsertVRSaveCode(MachineFunction &MF);
|
2006-03-17 02:25:23 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
virtual const char *getPassName() const {
|
|
|
|
return "PowerPC DAG->DAG Pattern Instruction Selection";
|
2010-12-24 12:28:06 +08:00
|
|
|
}
|
|
|
|
|
2005-09-14 06:03:06 +08:00
|
|
|
// Include the pieces autogenerated from the target description.
|
2005-10-15 07:37:35 +08:00
|
|
|
#include "PPCGenDAGISel.inc"
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-07 02:45:51 +08:00
|
|
|
private:
|
2010-01-05 09:24:18 +08:00
|
|
|
SDNode *SelectSETCC(SDNode *N);
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
|
|
|
|
void PeepholePPC64();
|
|
|
|
void PeepholdCROps();
|
2014-02-28 14:11:16 +08:00
|
|
|
|
|
|
|
bool AllUsersSelectZero(SDNode *N);
|
|
|
|
void SwapAllSelectUsers(SDNode *N);
|
2005-08-18 03:33:03 +08:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
/// InsertVRSaveCode - Once the entire function has been instruction selected,
|
|
|
|
/// all virtual registers are created and all machine instructions are built,
|
|
|
|
/// check to see if we need to save/restore VRSAVE. If so, do it.
|
2009-08-01 02:16:33 +08:00
|
|
|
void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
// Check to see if this function uses vector registers, which means we have to
|
2010-12-24 12:28:06 +08:00
|
|
|
// save and restore the VRSAVE register and update it with the regs we use.
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
//
|
2010-02-11 00:03:48 +08:00
|
|
|
// In this case, there will be virtual registers of vector type created
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
// by the scheduler. Detect them now.
|
|
|
|
bool HasVectorVReg = false;
|
2011-01-09 07:11:11 +08:00
|
|
|
for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
|
|
|
|
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
|
|
|
|
if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
HasVectorVReg = true;
|
|
|
|
break;
|
|
|
|
}
|
2011-01-09 07:11:11 +08:00
|
|
|
}
|
2006-03-17 02:25:23 +08:00
|
|
|
if (!HasVectorVReg) return; // nothing to do.
|
2010-12-24 12:28:06 +08:00
|
|
|
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
// If we have a vector register, we want to emit code into the entry and exit
|
|
|
|
// blocks to save and restore the VRSAVE register. We do this here (instead
|
|
|
|
// of marking all vector instructions as clobbering VRSAVE) for two reasons:
|
|
|
|
//
|
|
|
|
// 1. This (trivially) reduces the load on the register allocator, by not
|
|
|
|
// having to represent the live range of the VRSAVE register.
|
|
|
|
// 2. This (more significantly) allows us to create a temporary virtual
|
|
|
|
// register to hold the saved VRSAVE value, allowing this temporary to be
|
|
|
|
// register allocated, instead of forcing it to be spilled to the stack.
|
2006-03-17 02:25:23 +08:00
|
|
|
|
|
|
|
// Create two vregs - one to hold the VRSAVE register that is live-in to the
|
|
|
|
// function and one for the value after having bits or'd into it.
|
2007-12-31 12:13:23 +08:00
|
|
|
unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
|
|
|
|
unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-28 07:37:22 +08:00
|
|
|
const TargetInstrInfo &TII = *TM.getInstrInfo();
|
2006-03-17 02:25:23 +08:00
|
|
|
MachineBasicBlock &EntryBB = *Fn.begin();
|
2010-04-03 04:16:16 +08:00
|
|
|
DebugLoc dl;
|
2006-03-17 02:25:23 +08:00
|
|
|
// Emit the following code into the entry block:
|
|
|
|
// InVRSAVE = MFVRSAVE
|
|
|
|
// UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
|
|
|
|
// MTVRSAVE UpdatedVRSAVE
|
|
|
|
MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point
|
2009-02-13 10:27:39 +08:00
|
|
|
BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
|
|
|
|
BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
|
2008-01-07 09:56:04 +08:00
|
|
|
UpdatedVRSAVE).addReg(InVRSAVE);
|
2009-02-13 10:27:39 +08:00
|
|
|
BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
// Find all return blocks, outputting a restore in each epilog.
|
|
|
|
for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
|
2011-12-07 15:15:52 +08:00
|
|
|
if (!BB->empty() && BB->back().isReturn()) {
|
2006-03-17 02:25:23 +08:00
|
|
|
IP = BB->end(); --IP;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
// Skip over all terminator instructions, which are part of the return
|
|
|
|
// sequence.
|
|
|
|
MachineBasicBlock::iterator I2 = IP;
|
2011-12-07 15:15:52 +08:00
|
|
|
while (I2 != BB->begin() && (--I2)->isTerminator())
|
2006-03-17 02:25:23 +08:00
|
|
|
IP = I2;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
// Emit: MTVRSAVE InVRSave
|
2009-02-13 10:27:39 +08:00
|
|
|
BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
|
2010-12-24 12:28:06 +08:00
|
|
|
}
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
}
|
2005-10-07 02:45:51 +08:00
|
|
|
}
|
2005-09-03 09:17:22 +08:00
|
|
|
|
2006-03-17 02:25:23 +08:00
|
|
|
|
2005-08-20 06:38:53 +08:00
|
|
|
/// getGlobalBaseReg - Output the instructions required to put the
|
|
|
|
/// base address to use for accessing globals into a register.
|
|
|
|
///
|
2006-08-26 13:34:46 +08:00
|
|
|
SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
|
2005-08-20 06:38:53 +08:00
|
|
|
if (!GlobalBaseReg) {
|
2006-11-28 07:37:22 +08:00
|
|
|
const TargetInstrInfo &TII = *TM.getInstrInfo();
|
2005-08-20 06:38:53 +08:00
|
|
|
// Insert the set of GlobalBaseReg into the first MBB of the function
|
2009-08-15 10:07:36 +08:00
|
|
|
MachineBasicBlock &FirstMBB = MF->front();
|
2005-08-20 06:38:53 +08:00
|
|
|
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
|
2010-04-03 04:16:16 +08:00
|
|
|
DebugLoc dl;
|
2006-06-27 08:04:13 +08:00
|
|
|
|
2009-08-12 04:47:22 +08:00
|
|
|
if (PPCLowering.getPointerTy() == MVT::i32) {
|
2014-03-06 09:28:23 +08:00
|
|
|
GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
|
2011-05-19 10:56:28 +08:00
|
|
|
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
|
2009-02-13 10:27:39 +08:00
|
|
|
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
|
2006-11-15 02:43:11 +08:00
|
|
|
} else {
|
2014-03-06 09:28:23 +08:00
|
|
|
GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass);
|
2011-05-19 10:56:28 +08:00
|
|
|
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
|
2009-02-13 10:27:39 +08:00
|
|
|
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
|
2006-11-15 02:43:11 +08:00
|
|
|
}
|
2005-08-20 06:38:53 +08:00
|
|
|
}
|
2008-08-31 23:37:04 +08:00
|
|
|
return CurDAG->getRegister(GlobalBaseReg,
|
|
|
|
PPCLowering.getPointerTy()).getNode();
|
2005-08-20 06:38:53 +08:00
|
|
|
}
|
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
|
|
|
|
/// or 64-bit immediate, and if the value can be accurately represented as a
|
|
|
|
/// sign extension from a 16-bit value. If so, this returns true and the
|
|
|
|
/// immediate.
|
|
|
|
static bool isIntS16Immediate(SDNode *N, short &Imm) {
|
|
|
|
if (N->getOpcode() != ISD::Constant)
|
|
|
|
return false;
|
2005-08-20 06:38:53 +08:00
|
|
|
|
2008-09-13 00:56:44 +08:00
|
|
|
Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2008-09-13 00:56:44 +08:00
|
|
|
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
|
2006-06-27 08:04:13 +08:00
|
|
|
else
|
2008-09-13 00:56:44 +08:00
|
|
|
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
|
2006-06-27 08:04:13 +08:00
|
|
|
}
|
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
/// isIntS16Immediate - SDValue convenience overload; forwards the node that
/// Op refers to on to the SDNode* version.
static bool isIntS16Immediate(SDValue Op, short &Imm) {
  SDNode *Node = Op.getNode();
  return isIntS16Immediate(Node, Imm);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
|
|
|
|
/// operand. If so Imm will receive the 32-bit value.
|
|
|
|
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
|
2008-09-13 00:56:44 +08:00
|
|
|
Imm = cast<ConstantSDNode>(N)->getZExtValue();
|
2005-08-18 13:00:13 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
|
|
|
|
/// operand. If so Imm will receive the 64-bit value.
|
|
|
|
static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
|
2008-09-13 00:56:44 +08:00
|
|
|
Imm = cast<ConstantSDNode>(N)->getZExtValue();
|
2006-06-27 08:04:13 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// isInt32Immediate - This method tests to see if a constant operand.
|
|
|
|
// If so Imm will receive the 32 bit value.
|
2008-07-28 05:46:04 +08:00
|
|
|
static bool isInt32Immediate(SDValue N, unsigned &Imm) {
|
2008-08-29 05:40:38 +08:00
|
|
|
return isInt32Immediate(N.getNode(), Imm);
|
2006-06-27 08:04:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// isOpcWithIntImmediate - This method tests to see if the node is a specific
|
|
|
|
// opcode and that it has a immediate integer right operand.
|
|
|
|
// If so Imm will receive the 32 bit value.
|
|
|
|
static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
|
2008-08-31 23:37:04 +08:00
|
|
|
return N->getOpcode() == Opc
|
|
|
|
&& isInt32Immediate(N->getOperand(1).getNode(), Imm);
|
2006-06-27 08:04:13 +08:00
|
|
|
}
|
|
|
|
|
2006-09-22 13:01:56 +08:00
|
|
|
bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
|
2013-07-12 00:31:51 +08:00
|
|
|
if (!Val)
|
|
|
|
return false;
|
|
|
|
|
2005-08-18 15:30:46 +08:00
|
|
|
if (isShiftedMask_32(Val)) {
|
|
|
|
// look for the first non-zero bit
|
2013-05-25 06:23:49 +08:00
|
|
|
MB = countLeadingZeros(Val);
|
2005-08-18 15:30:46 +08:00
|
|
|
// look for the first zero bit after the run of ones
|
2013-05-25 06:23:49 +08:00
|
|
|
ME = countLeadingZeros((Val - 1) ^ Val);
|
2005-08-18 15:30:46 +08:00
|
|
|
return true;
|
2005-08-25 12:47:18 +08:00
|
|
|
} else {
|
|
|
|
Val = ~Val; // invert mask
|
|
|
|
if (isShiftedMask_32(Val)) {
|
|
|
|
// effectively look for the first zero bit
|
2013-05-25 06:23:49 +08:00
|
|
|
ME = countLeadingZeros(Val) - 1;
|
2005-08-25 12:47:18 +08:00
|
|
|
// effectively look for the first one bit after the run of zeros
|
2013-05-25 06:23:49 +08:00
|
|
|
MB = countLeadingZeros((Val - 1) ^ Val) + 1;
|
2005-08-25 12:47:18 +08:00
|
|
|
return true;
|
|
|
|
}
|
2005-08-18 15:30:46 +08:00
|
|
|
}
|
|
|
|
// no run present
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-12-24 12:28:06 +08:00
|
|
|
/// isRotateAndMask - Test whether N is an i32 SHL, SRL or ROTL by a constant
/// amount whose result, restricted to Mask, can be expressed as a rotate-left
/// by SH combined with the run-of-ones mask MB..ME.
///
/// \param N            the shift/rotate node to examine.
/// \param Mask         the mask applied to the result.
/// \param isShiftMask  if set, Mask is shifted by the same amount (and in the
///                     same direction) as N before it is checked.
/// \param SH, MB, ME   outputs; SH is written once the mask is known not to
///                     touch shifted-in bits, MB/ME by isRunOfOnes.
/// \returns true if the mask avoids every bit the shift makes indeterminate
///          and is still a run of ones.
bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
                                      bool isShiftMask, unsigned &SH,
                                      unsigned &MB, unsigned &ME) {
  // Only i32 is handled here; i64 needs different logic for
  // rldicl/rldicr/rldimi.
  if (N->getValueType(0) != MVT::i32)
    return false;

  // The node must be binary with a constant amount in [0, 31] as operand 1.
  unsigned Amount = 32;
  if (N->getNumOperands() != 2 ||
      !isInt32Immediate(N->getOperand(1).getNode(), Amount) || Amount > 31)
    return false;

  // Bits of the result that the operation fills with shifted-in zeros and
  // that a pure rotate would therefore not reproduce.
  unsigned Undetermined;
  switch (N->getOpcode()) {
  case ISD::SHL:
    if (isShiftMask)
      Mask <<= Amount;
    Undetermined = ~(0xFFFFFFFFu << Amount);
    break;
  case ISD::SRL:
    if (isShiftMask)
      Mask >>= Amount;
    Undetermined = ~(0xFFFFFFFFu >> Amount);
    // A right shift by n corresponds to a left rotate by 32 - n.
    Amount = 32 - Amount;
    break;
  case ISD::ROTL:
    Undetermined = 0;
    break;
  default:
    return false;
  }

  // Reject an empty mask or one that overlaps any indeterminate bit.
  if (!Mask || (Mask & Undetermined))
    return false;

  SH = Amount & 31;
  // The mask may no longer be a valid run after shifting (wrap-arounds).
  return isRunOfOnes(Mask, MB, ME);
}
|
|
|
|
|
2005-08-19 08:38:14 +08:00
|
|
|
/// SelectBitfieldInsert - turn an or of two masked values into
|
|
|
|
/// the rotate left word immediate then mask insert (rlwimi) instruction.
|
2005-10-18 08:28:58 +08:00
|
|
|
SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Op0 = N->getOperand(0);
|
|
|
|
SDValue Op1 = N->getOperand(1);
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-02-27 09:23:58 +08:00
|
|
|
APInt LKZ, LKO, RKZ, RKO;
|
2012-04-04 20:51:34 +08:00
|
|
|
CurDAG->ComputeMaskedBits(Op0, LKZ, LKO);
|
|
|
|
CurDAG->ComputeMaskedBits(Op1, RKZ, RKO);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-02-27 09:23:58 +08:00
|
|
|
unsigned TargetMask = LKZ.getZExtValue();
|
|
|
|
unsigned InsertMask = RKZ.getZExtValue();
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-05-09 01:38:32 +08:00
|
|
|
if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
|
|
|
|
unsigned Op0Opc = Op0.getOpcode();
|
|
|
|
unsigned Op1Opc = Op1.getOpcode();
|
|
|
|
unsigned Value, SH = 0;
|
|
|
|
TargetMask = ~TargetMask;
|
|
|
|
InsertMask = ~InsertMask;
|
2006-05-07 08:23:38 +08:00
|
|
|
|
2006-05-09 01:38:32 +08:00
|
|
|
// If the LHS has a foldable shift and the RHS does not, then swap it to the
|
|
|
|
// RHS so that we can fold the shift into the insert.
|
2006-05-07 08:23:38 +08:00
|
|
|
if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
|
|
|
|
if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
|
|
|
|
Op0.getOperand(0).getOpcode() == ISD::SRL) {
|
|
|
|
if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
|
|
|
|
Op1.getOperand(0).getOpcode() != ISD::SRL) {
|
|
|
|
std::swap(Op0, Op1);
|
|
|
|
std::swap(Op0Opc, Op1Opc);
|
2006-05-09 01:38:32 +08:00
|
|
|
std::swap(TargetMask, InsertMask);
|
2006-05-07 08:23:38 +08:00
|
|
|
}
|
|
|
|
}
|
2006-05-09 01:38:32 +08:00
|
|
|
} else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
|
|
|
|
if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
|
|
|
|
Op1.getOperand(0).getOpcode() != ISD::SRL) {
|
|
|
|
std::swap(Op0, Op1);
|
|
|
|
std::swap(Op0Opc, Op1Opc);
|
|
|
|
std::swap(TargetMask, InsertMask);
|
|
|
|
}
|
2005-08-19 08:38:14 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-05-07 08:23:38 +08:00
|
|
|
unsigned MB, ME;
|
2013-07-12 00:31:51 +08:00
|
|
|
if (isRunOfOnes(InsertMask, MB, ME)) {
|
2009-11-21 06:16:40 +08:00
|
|
|
SDValue Tmp1, Tmp2;
|
2006-05-07 08:23:38 +08:00
|
|
|
|
|
|
|
if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
|
2006-06-27 08:04:13 +08:00
|
|
|
isInt32Immediate(Op1.getOperand(1), Value)) {
|
2006-05-07 08:23:38 +08:00
|
|
|
Op1 = Op1.getOperand(0);
|
|
|
|
SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
|
2005-08-19 08:38:14 +08:00
|
|
|
}
|
2006-05-07 08:23:38 +08:00
|
|
|
if (Op1Opc == ISD::AND) {
|
|
|
|
unsigned SHOpc = Op1.getOperand(0).getOpcode();
|
|
|
|
if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
|
2006-06-27 08:04:13 +08:00
|
|
|
isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
|
2013-06-29 04:00:07 +08:00
|
|
|
// Note that Value must be in range here (less than 32) because
|
|
|
|
// otherwise there would not be any bits set in InsertMask.
|
2006-05-07 08:23:38 +08:00
|
|
|
Op1 = Op1.getOperand(0).getOperand(0);
|
|
|
|
SH = (SHOpc == ISD::SHL) ? Value : 32 - Value;
|
|
|
|
}
|
2005-08-19 08:38:14 +08:00
|
|
|
}
|
2009-11-21 06:16:40 +08:00
|
|
|
|
2006-05-13 00:29:37 +08:00
|
|
|
SH &= 31;
|
2009-11-21 06:16:40 +08:00
|
|
|
SDValue Ops[] = { Op0, Op1, getI32Imm(SH), getI32Imm(MB),
|
2006-08-27 16:14:06 +08:00
|
|
|
getI32Imm(ME) };
|
2013-04-20 06:22:57 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
|
2005-08-19 08:38:14 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-08-22 02:50:37 +08:00
|
|
|
/// SelectCC - Select a comparison of the specified values with the specified
|
|
|
|
/// condition code, returning the CR# of the expression.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
|
2013-05-25 10:42:55 +08:00
|
|
|
ISD::CondCode CC, SDLoc dl) {
|
2005-08-22 02:50:37 +08:00
|
|
|
// Always select the LHS.
|
2006-06-27 08:04:13 +08:00
|
|
|
unsigned Opc;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2009-08-12 04:47:22 +08:00
|
|
|
if (LHS.getValueType() == MVT::i32) {
|
2006-06-27 08:10:13 +08:00
|
|
|
unsigned Imm;
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
|
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
|
|
|
if (isInt32Immediate(RHS, Imm)) {
|
|
|
|
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
|
|
|
// If this is a 16-bit signed immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<16>((int)Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
|
|
|
// For non-equality comparisons, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
2010-12-24 12:28:06 +08:00
|
|
|
// ori r2, r2, 22136
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
|
|
|
// cmpw cr0, r3, r2
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmplwi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
|
|
|
|
getI32Imm(Imm >> 16)), 0);
|
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
|
|
|
}
|
|
|
|
Opc = PPC::CMPLW;
|
|
|
|
} else if (ISD::isUnsignedIntSetCC(CC)) {
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::CMPLW;
|
|
|
|
} else {
|
|
|
|
short SImm;
|
|
|
|
if (isIntS16Immediate(RHS, SImm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
|
|
|
|
getI32Imm((int)SImm & 0xFFFF)),
|
2006-06-27 08:04:13 +08:00
|
|
|
0);
|
|
|
|
Opc = PPC::CMPW;
|
|
|
|
}
|
2009-08-12 04:47:22 +08:00
|
|
|
} else if (LHS.getValueType() == MVT::i64) {
|
2006-06-27 08:04:13 +08:00
|
|
|
uint64_t Imm;
|
2006-09-20 12:33:27 +08:00
|
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isInt64Immediate(RHS.getNode(), Imm)) {
|
2006-09-20 12:33:27 +08:00
|
|
|
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
2006-09-20 12:33:27 +08:00
|
|
|
// If this is a 16-bit signed immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
|
|
|
|
getI32Imm(Imm & 0xFFFF)), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-09-20 12:33:27 +08:00
|
|
|
// For non-equality comparisons, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
2010-12-24 12:28:06 +08:00
|
|
|
// ori r2, r2, 22136
|
2006-09-20 12:33:27 +08:00
|
|
|
// cmpd cr0, r3, r2
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmpldi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<32>(Imm)) {
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
|
|
|
|
getI64Imm(Imm >> 16)), 0);
|
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
|
|
|
|
getI64Imm(Imm & 0xFFFF)), 0);
|
2006-09-20 12:33:27 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Opc = PPC::CMPLD;
|
|
|
|
} else if (ISD::isUnsignedIntSetCC(CC)) {
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
|
|
|
|
getI64Imm(Imm & 0xFFFF)), 0);
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::CMPLD;
|
|
|
|
} else {
|
|
|
|
short SImm;
|
|
|
|
if (isIntS16Immediate(RHS, SImm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
|
|
|
|
getI64Imm(SImm & 0xFFFF)),
|
2006-06-27 08:04:13 +08:00
|
|
|
0);
|
|
|
|
Opc = PPC::CMPD;
|
|
|
|
}
|
2009-08-12 04:47:22 +08:00
|
|
|
} else if (LHS.getValueType() == MVT::f32) {
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::FCMPUS;
|
2005-08-22 02:50:37 +08:00
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
Opc = PPCSubTarget.hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
|
|
|
|
2006-11-18 06:10:59 +08:00
|
|
|
/// getPredicateForSetCC - Translate an ISD condition code into the matching
/// PPC::Predicate. The ordered, unordered and "don't care" flavours of a
/// comparison share one predicate. SETUEQ, SETONE, SETOLE and SETOGE are not
/// accepted here; the unreachable message says legalize should have lowered
/// them.
static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:
  case ISD::SETOEQ:
    return PPC::PRED_EQ;

  case ISD::SETNE:
  case ISD::SETUNE:
    return PPC::PRED_NE;

  case ISD::SETLT:
  case ISD::SETOLT:
  // SETULT is invalid for floating point. Assume we have int.
  case ISD::SETULT:
    return PPC::PRED_LT;

  case ISD::SETLE:
  case ISD::SETULE:
    return PPC::PRED_LE;

  case ISD::SETGT:
  case ISD::SETOGT:
  // SETUGT is invalid for floating point. Assume we have int.
  case ISD::SETUGT:
    return PPC::PRED_GT;

  case ISD::SETGE:
  case ISD::SETUGE:
    return PPC::PRED_GE;

  case ISD::SETO:
    return PPC::PRED_NU;
  case ISD::SETUO:
    return PPC::PRED_UN;

  case ISD::SETUEQ:
  case ISD::SETONE:
  case ISD::SETOLE:
  case ISD::SETOGE:
    llvm_unreachable("Should be lowered by legalize!");

  default:
    llvm_unreachable("Unknown condition!");
  }
}
|
|
|
|
|
2005-08-26 04:08:18 +08:00
|
|
|
/// getCRIdxForSetCC - Return the index of the condition register field
|
|
|
|
/// associated with the SetCC condition, and whether or not the field is
|
|
|
|
/// treated as inverted. That is, lt = 0; ge = 0 inverted.
|
2013-07-03 23:13:30 +08:00
|
|
|
static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
|
2008-01-08 14:46:30 +08:00
|
|
|
Invert = false;
|
2005-08-26 04:08:18 +08:00
|
|
|
switch (CC) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unknown condition!");
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETOLT:
|
|
|
|
case ISD::SETLT: return 0; // Bit #0 = SETOLT
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETGT: return 1; // Bit #1 = SETOGT
|
|
|
|
case ISD::SETOEQ:
|
|
|
|
case ISD::SETEQ: return 2; // Bit #2 = SETOEQ
|
|
|
|
case ISD::SETUO: return 3; // Bit #3 = SETUO
|
2005-08-26 04:08:18 +08:00
|
|
|
case ISD::SETUGE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE
|
2005-08-26 04:08:18 +08:00
|
|
|
case ISD::SETULE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE
|
2006-05-26 02:06:16 +08:00
|
|
|
case ISD::SETUNE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE
|
|
|
|
case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO
|
2010-12-24 12:28:06 +08:00
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETOLE:
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETONE:
|
2009-07-15 00:55:14 +08:00
|
|
|
llvm_unreachable("Invalid branch code: should be expanded by legalize");
|
2008-11-08 06:54:33 +08:00
|
|
|
// These are invalid for floating point. Assume integer.
|
|
|
|
case ISD::SETULT: return 0;
|
|
|
|
case ISD::SETUGT: return 1;
|
2005-08-26 04:08:18 +08:00
|
|
|
}
|
|
|
|
}
|
2005-08-22 06:31:09 +08:00
|
|
|
|
2012-10-30 21:50:19 +08:00
|
|
|
// getVCmpInst: return the vector compare instruction for the specified
|
|
|
|
// vector type and condition code. Since this is for altivec specific code,
|
|
|
|
// only support the altivec types (v16i8, v8i16, v4i32, and v4f32).
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC,
|
|
|
|
bool HasVSX) {
|
2012-10-30 21:50:19 +08:00
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETEQ:
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETNE:
|
|
|
|
case ISD::SETUNE:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPEQUB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPEQUH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPEQUW;
|
|
|
|
// v4f32 != v4f32 could be translate to unordered not equal
|
|
|
|
else if (VecVT == MVT::v4f32)
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPEQDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETLT:
|
|
|
|
case ISD::SETGT:
|
|
|
|
case ISD::SETLE:
|
|
|
|
case ISD::SETGE:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPGTSB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPGTSH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPGTSW;
|
|
|
|
else if (VecVT == MVT::v4f32)
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPGTDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETUGT:
|
|
|
|
case ISD::SETUGE:
|
|
|
|
case ISD::SETULE:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPGTUB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPGTUH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPGTUW;
|
|
|
|
break;
|
|
|
|
case ISD::SETOEQ:
|
|
|
|
if (VecVT == MVT::v4f32)
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPEQDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETOLT:
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETOLE:
|
|
|
|
if (VecVT == MVT::v4f32)
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPGTDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETOGE:
|
|
|
|
if (VecVT == MVT::v4f32)
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPGEDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
llvm_unreachable("Invalid integer vector compare condition");
|
|
|
|
}
|
|
|
|
|
|
|
|
// getVCmpEQInst: return the equal compare instruction for the specified vector
|
|
|
|
// type. Since this is for altivec specific code, only support the altivec
|
|
|
|
// types (v16i8, v8i16, v4i32, and v4f32).
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT, bool HasVSX) {
|
2012-10-30 21:50:19 +08:00
|
|
|
switch (VecVT) {
|
|
|
|
case MVT::v16i8:
|
|
|
|
return PPC::VCMPEQUB;
|
|
|
|
case MVT::v8i16:
|
|
|
|
return PPC::VCMPEQUH;
|
|
|
|
case MVT::v4i32:
|
|
|
|
return PPC::VCMPEQUW;
|
|
|
|
case MVT::v4f32:
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
|
|
|
|
case MVT::v2f64:
|
|
|
|
return PPC::XVCMPEQDP;
|
2012-10-30 21:50:19 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid integer vector compare condition");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-05 09:24:18 +08:00
|
|
|
SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2005-10-07 03:03:35 +08:00
|
|
|
unsigned Imm;
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
|
2011-06-20 23:28:39 +08:00
|
|
|
EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
|
|
|
|
bool isPPC64 = (PtrVT == MVT::i64);
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
if (!PPCSubTarget.useCRBits() &&
|
|
|
|
isInt32Immediate(N->getOperand(1), Imm)) {
|
2005-10-07 03:03:35 +08:00
|
|
|
// We can codegen setcc op, imm very efficiently compared to a brcond.
|
|
|
|
// Check for those cases here.
|
|
|
|
// setcc op, 0
|
|
|
|
if (Imm == 0) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Op = N->getOperand(0);
|
2005-10-07 03:03:35 +08:00
|
|
|
switch (CC) {
|
2005-10-22 05:17:10 +08:00
|
|
|
default: break;
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETEQ: {
|
2009-09-26 02:54:59 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETNE: {
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue AD =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2009-09-26 02:54:59 +08:00
|
|
|
Op, getI32Imm(~0U)), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
|
2006-08-26 16:00:10 +08:00
|
|
|
AD.getValue(1));
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETLT: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETGT: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue T =
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
|
|
|
|
T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
|
|
|
} else if (Imm == ~0U) { // setcc op, -1
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Op = N->getOperand(0);
|
2005-10-07 03:03:35 +08:00
|
|
|
switch (CC) {
|
2005-10-22 05:17:10 +08:00
|
|
|
default: break;
|
|
|
|
case ISD::SETEQ:
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2010-12-21 10:38:05 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2009-09-26 02:54:59 +08:00
|
|
|
Op, getI32Imm(1)), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
|
|
|
|
SDValue(CurDAG->getMachineNode(PPC::LI, dl,
|
2009-09-26 02:54:59 +08:00
|
|
|
MVT::i32,
|
|
|
|
getI32Imm(0)), 0),
|
2009-02-05 07:02:30 +08:00
|
|
|
Op.getValue(1));
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETNE: {
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2009-09-26 02:54:59 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
|
2010-12-21 10:38:05 +08:00
|
|
|
SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2009-09-26 02:54:59 +08:00
|
|
|
Op, getI32Imm(~0U));
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
|
2008-07-28 05:46:04 +08:00
|
|
|
Op, SDValue(AD, 1));
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
|
|
|
case ISD::SETLT: {
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
|
|
|
|
getI32Imm(1)), 0);
|
|
|
|
SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
|
|
|
|
Op), 0);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETGT: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
|
2013-04-20 06:22:57 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops),
|
2009-02-05 07:02:30 +08:00
|
|
|
0);
|
2010-12-24 12:28:06 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
|
2006-08-26 16:00:10 +08:00
|
|
|
getI32Imm(1));
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2012-10-09 02:59:53 +08:00
|
|
|
SDValue LHS = N->getOperand(0);
|
|
|
|
SDValue RHS = N->getOperand(1);
|
|
|
|
|
2012-10-30 21:50:19 +08:00
|
|
|
// Altivec Vector compare instructions do not set any CR register by default and
|
|
|
|
// vector compare operations return the same type as the operands.
|
2012-10-09 02:59:53 +08:00
|
|
|
if (LHS.getValueType().isVector()) {
|
2012-10-30 21:50:19 +08:00
|
|
|
EVT VecVT = LHS.getValueType();
|
|
|
|
MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget.hasVSX());
|
2012-10-30 21:50:19 +08:00
|
|
|
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETEQ:
|
|
|
|
case ISD::SETOEQ:
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
|
|
|
|
case ISD::SETNE:
|
|
|
|
case ISD::SETONE:
|
|
|
|
case ISD::SETUNE: {
|
|
|
|
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::VNOR, VecVT, VCmp, VCmp);
|
|
|
|
}
|
|
|
|
case ISD::SETLT:
|
|
|
|
case ISD::SETOLT:
|
|
|
|
case ISD::SETULT:
|
|
|
|
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, RHS, LHS);
|
|
|
|
case ISD::SETGT:
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETUGT:
|
|
|
|
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
|
|
|
|
case ISD::SETGE:
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETUGE: {
|
|
|
|
// Small optimization: Altivec provides a 'Vector Compare Greater Than
|
|
|
|
// or Equal To' instruction (vcmpgefp), so in this case there is no
|
|
|
|
// need for extra logic for the equal compare.
|
|
|
|
if (VecVT.getSimpleVT().isFloatingPoint()) {
|
|
|
|
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
|
|
|
|
} else {
|
|
|
|
SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
|
2012-10-30 21:50:19 +08:00
|
|
|
SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpGT, VCmpEQ);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
case ISD::SETLE:
|
|
|
|
case ISD::SETOLE:
|
|
|
|
case ISD::SETULE: {
|
|
|
|
SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
|
2012-10-30 21:50:19 +08:00
|
|
|
SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpLE, VCmpEQ);
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid vector compare type: should be expanded by legalize");
|
|
|
|
}
|
2012-10-09 02:59:53 +08:00
|
|
|
}
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
if (PPCSubTarget.useCRBits())
|
|
|
|
return 0;
|
|
|
|
|
2005-10-07 03:03:35 +08:00
|
|
|
bool Inv;
|
2013-07-03 23:13:30 +08:00
|
|
|
unsigned Idx = getCRIdxForSetCC(CC, Inv);
|
2012-10-09 02:59:53 +08:00
|
|
|
SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue IntCR;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-07 03:03:35 +08:00
|
|
|
// Force the ccreg into CR7.
|
2009-08-12 04:47:22 +08:00
|
|
|
SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue InFlag(0, 0); // Null incoming flag value.
|
2010-12-24 12:28:06 +08:00
|
|
|
CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
|
2005-12-01 11:50:19 +08:00
|
|
|
InFlag).getValue(1);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code is intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
|
|
|
|
CCReg), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
|
2006-08-27 16:14:06 +08:00
|
|
|
getI32Imm(31), getI32Imm(31) };
|
2013-07-03 23:13:30 +08:00
|
|
|
if (!Inv)
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2008-01-08 14:46:30 +08:00
|
|
|
|
|
|
|
// Get the specified bit.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Tmp =
|
2013-04-20 06:22:57 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
|
2013-07-03 23:13:30 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
2005-10-07 02:56:10 +08:00
|
|
|
|
2005-10-07 03:07:45 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
// Select - Convert the specified operand from a target-independent to a
|
|
|
|
// target-specific node if it hasn't already been changed.
|
2010-01-05 09:24:18 +08:00
|
|
|
SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2013-09-22 16:21:56 +08:00
|
|
|
if (N->isMachineOpcode()) {
|
|
|
|
N->setNodeId(-1);
|
2006-08-11 17:08:15 +08:00
|
|
|
return NULL; // Already selected.
|
2013-09-22 16:21:56 +08:00
|
|
|
}
|
2005-09-29 08:59:32 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
switch (N->getOpcode()) {
|
2005-09-08 07:45:15 +08:00
|
|
|
default: break;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
case ISD::Constant: {
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i64) {
|
2006-12-12 21:23:43 +08:00
|
|
|
// Get 64 bit value.
|
2008-09-13 00:56:44 +08:00
|
|
|
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
|
2006-12-12 21:23:43 +08:00
|
|
|
// Assume no remaining bits.
|
|
|
|
unsigned Remainder = 0;
|
|
|
|
// Assume no shift required.
|
|
|
|
unsigned Shift = 0;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
// If it can't be represented as a 32 bit value.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (!isInt<32>(Imm)) {
|
2013-05-25 06:23:49 +08:00
|
|
|
Shift = countTrailingZeros<uint64_t>(Imm);
|
2006-12-12 21:23:43 +08:00
|
|
|
int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
// If the shifted value fits 32 bits.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<32>(ImmSh)) {
|
2006-12-12 21:23:43 +08:00
|
|
|
// Go with the shifted value.
|
|
|
|
Imm = ImmSh;
|
|
|
|
} else {
|
|
|
|
// Still stuck with a 64 bit value.
|
|
|
|
Remainder = Imm;
|
|
|
|
Shift = 32;
|
|
|
|
Imm >>= 32;
|
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
// Intermediate operand.
|
|
|
|
SDNode *Result;
|
|
|
|
|
|
|
|
// Handle first 32 bits.
|
|
|
|
unsigned Lo = Imm & 0xFFFF;
|
|
|
|
unsigned Hi = (Imm >> 16) & 0xFFFF;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
// Simple value.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<16>(Imm)) {
|
2006-12-12 21:23:43 +08:00
|
|
|
// Just the Lo bits.
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
|
2006-12-12 21:23:43 +08:00
|
|
|
} else if (Lo) {
|
|
|
|
// Handle the Hi bits.
|
|
|
|
unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
|
2006-12-12 21:23:43 +08:00
|
|
|
// And Lo bits.
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Lo));
|
2006-12-12 21:23:43 +08:00
|
|
|
} else {
|
|
|
|
// Just the Hi bits.
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
|
2006-12-12 21:23:43 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
// If no shift, we're done.
|
|
|
|
if (!Shift) return Result;
|
|
|
|
|
|
|
|
// Shift for next step if the upper 32-bits were not zero.
|
|
|
|
if (Imm) {
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0),
|
|
|
|
getI32Imm(Shift),
|
|
|
|
getI32Imm(63 - Shift));
|
2006-12-12 21:23:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Add in the last bits as required.
|
|
|
|
if ((Hi = (Remainder >> 16) & 0xFFFF)) {
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Hi));
|
2010-12-24 12:28:06 +08:00
|
|
|
}
|
2006-12-12 21:23:43 +08:00
|
|
|
if ((Lo = Remainder & 0xFFFF)) {
|
2009-09-26 02:54:59 +08:00
|
|
|
Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Lo));
|
2006-12-12 21:23:43 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
case ISD::SETCC: {
|
|
|
|
SDNode *SN = SelectSETCC(N);
|
|
|
|
if (SN)
|
|
|
|
return SN;
|
|
|
|
break;
|
|
|
|
}
|
2006-02-09 08:37:58 +08:00
|
|
|
case PPCISD::GlobalBaseReg:
|
2006-08-26 13:34:46 +08:00
|
|
|
return getGlobalBaseReg();
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-25 08:45:43 +08:00
|
|
|
case ISD::FrameIndex: {
|
|
|
|
int FI = cast<FrameIndexSDNode>(N)->getIndex();
|
2010-01-05 09:24:18 +08:00
|
|
|
SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
|
|
|
|
unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
|
2006-08-16 07:48:22 +08:00
|
|
|
if (N->hasOneUse())
|
2010-01-05 09:24:18 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
|
2006-08-26 16:00:10 +08:00
|
|
|
getSmallIPtrImm(0));
|
2010-01-05 09:24:18 +08:00
|
|
|
return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
|
2009-09-26 02:54:59 +08:00
|
|
|
getSmallIPtrImm(0));
|
2005-08-25 08:45:43 +08:00
|
|
|
}
|
2006-03-26 18:06:40 +08:00
|
|
|
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code is intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
case PPCISD::MFOCRF: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue InFlag = N->getOperand(1);
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code is intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
|
|
|
|
N->getOperand(0), InFlag);
|
2006-03-26 18:06:40 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-09-29 06:50:24 +08:00
|
|
|
case ISD::SDIV: {
|
2005-10-21 08:02:42 +08:00
|
|
|
// FIXME: since this depends on the setting of the carry flag from the srawi
|
|
|
|
// we should really be making notes about that for the scheduler.
|
2010-12-24 12:28:06 +08:00
|
|
|
// FIXME: It sure would be nice if we could cheaply recognize the
|
2005-10-21 08:02:42 +08:00
|
|
|
// srl/add/sra pattern the dag combiner will generate for this as
|
|
|
|
// sra/addze rather than having to handle sdiv ourselves. oh well.
|
2005-08-26 01:50:06 +08:00
|
|
|
unsigned Imm;
|
2006-06-27 08:04:13 +08:00
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue N0 = N->getOperand(0);
|
2005-08-26 01:50:06 +08:00
|
|
|
if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
|
2006-02-09 15:17:49 +08:00
|
|
|
SDNode *Op =
|
2010-12-21 10:38:05 +08:00
|
|
|
CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
|
2009-09-26 02:54:59 +08:00
|
|
|
N0, getI32Imm(Log2_32(Imm)));
|
2010-12-24 12:28:06 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue(Op, 0), SDValue(Op, 1));
|
2005-08-26 01:50:06 +08:00
|
|
|
} else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
|
2006-02-09 15:17:49 +08:00
|
|
|
SDNode *Op =
|
2010-12-21 10:38:05 +08:00
|
|
|
CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
|
2009-09-26 02:54:59 +08:00
|
|
|
N0, getI32Imm(Log2_32(-Imm)));
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue PT =
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
|
|
|
|
SDValue(Op, 0), SDValue(Op, 1)),
|
2006-02-09 15:17:49 +08:00
|
|
|
0);
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
|
2005-08-26 01:50:06 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-09-30 07:33:31 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-26 06:04:30 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-10 10:08:47 +08:00
|
|
|
case ISD::LOAD: {
|
|
|
|
// Handle preincrement loads.
|
2010-01-05 09:24:18 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(N);
|
2009-08-11 06:56:29 +08:00
|
|
|
EVT LoadedVT = LD->getMemoryVT();
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-10 10:08:47 +08:00
|
|
|
// Normal loads are handled by code generated from the .td file.
|
|
|
|
if (LD->getAddressingMode() != ISD::PRE_INC)
|
|
|
|
break;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Offset = LD->getOffset();
|
2013-03-22 22:58:17 +08:00
|
|
|
if (Offset.getOpcode() == ISD::TargetConstant ||
|
allow the offset of a preinc'd load to be the low-part of a global. This
produces this clever code:
_millisecs:
lis r2, ha16(_Time.1182)
lwzu r3, lo16(_Time.1182)(r2)
lwz r2, 4(r2)
addic r4, r2, 1
addze r3, r3
blr
instead of this:
_millisecs:
lis r2, ha16(_Time.1182)
la r3, lo16(_Time.1182)(r2)
lwz r2, lo16(_Time.1182)(r2)
lwz r3, 4(r3)
addic r4, r3, 1
addze r3, r2
blr
for:
long %millisecs() {
%tmp = load long* %Time.1182 ; <long> [#uses=1]
%tmp1 = add long %tmp, 1 ; <long> [#uses=1]
ret long %tmp1
}
llvm-svn: 31673
2006-11-11 12:53:30 +08:00
|
|
|
Offset.getOpcode() == ISD::TargetGlobalAddress) {
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-16 03:55:13 +08:00
|
|
|
unsigned Opcode;
|
|
|
|
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (LD->getValueType(0) != MVT::i64) {
|
2006-11-16 03:55:13 +08:00
|
|
|
// Handle PPC32 integer and normal FP loads.
|
2009-08-12 04:47:22 +08:00
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::f64: Opcode = PPC::LFDU; break;
|
|
|
|
case MVT::f32: Opcode = PPC::LFSU; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZU; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZU; break;
|
2006-11-16 03:55:13 +08:00
|
|
|
}
|
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i64: Opcode = PPC::LDU; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZU8; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZU8; break;
|
2006-11-16 03:55:13 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue Base = LD->getBasePtr();
|
|
|
|
SDValue Ops[] = { Offset, Base, Chain };
|
2009-09-26 02:54:59 +08:00
|
|
|
return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
|
|
|
|
PPCLowering.getPointerTy(),
|
2013-04-20 06:22:57 +08:00
|
|
|
MVT::Other, Ops);
|
2006-11-10 10:08:47 +08:00
|
|
|
} else {
|
2012-06-20 23:43:03 +08:00
|
|
|
unsigned Opcode;
|
|
|
|
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
|
|
|
|
if (LD->getValueType(0) != MVT::i64) {
|
|
|
|
// Handle PPC32 integer and normal FP loads.
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
|
|
|
case MVT::f64: Opcode = PPC::LFDUX; break;
|
|
|
|
case MVT::f32: Opcode = PPC::LFSUX; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZUX; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZUX; break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
|
|
|
|
"Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
|
|
|
case MVT::i64: Opcode = PPC::LDUX; break;
|
|
|
|
case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZUX8; break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue Base = LD->getBasePtr();
|
2013-03-22 22:58:48 +08:00
|
|
|
SDValue Ops[] = { Base, Offset, Chain };
|
2012-06-20 23:43:03 +08:00
|
|
|
return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
|
|
|
|
PPCLowering.getPointerTy(),
|
2013-04-20 06:22:57 +08:00
|
|
|
MVT::Other, Ops);
|
2006-11-10 10:08:47 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-18 15:30:46 +08:00
|
|
|
case ISD::AND: {
|
2006-09-22 13:01:56 +08:00
|
|
|
unsigned Imm, Imm2, SH, MB, ME;
|
2012-08-28 10:10:15 +08:00
|
|
|
uint64_t Imm64;
|
2006-09-22 13:01:56 +08:00
|
|
|
|
2005-08-18 15:30:46 +08:00
|
|
|
// If this is an and of a value rotated between 0 and 31 bits and then and'd
|
|
|
|
// with a mask, emit rlwinm
|
2006-06-27 08:04:13 +08:00
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2008-08-29 05:40:38 +08:00
|
|
|
isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Val = N->getOperand(0).getOperand(0);
|
|
|
|
SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2005-08-18 15:30:46 +08:00
|
|
|
}
|
2006-09-22 13:01:56 +08:00
|
|
|
// If this is just a masked value where the input is not handled above, and
|
|
|
|
// is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
|
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2010-12-24 12:28:06 +08:00
|
|
|
isRunOfOnes(Imm, MB, ME) &&
|
2006-09-22 13:01:56 +08:00
|
|
|
N->getOperand(0).getOpcode() != ISD::ROTL) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Val = N->getOperand(0);
|
|
|
|
SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
2006-09-22 13:01:56 +08:00
|
|
|
}
|
2012-08-28 10:10:15 +08:00
|
|
|
// If this is a 64-bit zero-extension mask, emit rldicl.
|
|
|
|
if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
|
|
|
|
isMask_64(Imm64)) {
|
|
|
|
SDValue Val = N->getOperand(0);
|
|
|
|
MB = 64 - CountTrailingOnes_64(Imm64);
|
PPC: Optimize rldicl generation for masked shifts
Masking operations (where only some number of the low bits are being kept) are
selected to rldicl(x, 0, mb). If x is a logical right shift (which would become
rldicl(y, 64-n, n)), we might be able to fold the two instructions together:
rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) for n <= mb
The right shift is really a left rotate followed by a mask, and if the explicit
mask is a more-restrictive sub-mask of the mask implied by the shift, only one
rldicl is needed.
llvm-svn: 195185
2013-11-20 09:10:15 +08:00
|
|
|
SH = 0;
|
|
|
|
|
|
|
|
// If the operand is a logical right shift, we can fold it into this
|
|
|
|
// instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
|
|
|
|
// for n <= mb. The right shift is really a left rotate followed by a
|
|
|
|
// mask, and this mask is a more-restrictive sub-mask of the mask implied
|
|
|
|
// by the shift.
|
|
|
|
if (Val.getOpcode() == ISD::SRL &&
|
|
|
|
isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
|
|
|
|
assert(Imm < 64 && "Illegal shift amount");
|
|
|
|
Val = Val.getOperand(0);
|
|
|
|
SH = 64 - Imm;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) };
|
2012-08-28 10:10:15 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
|
|
|
|
}
|
2006-09-22 13:01:56 +08:00
|
|
|
// AND X, 0 -> 0, not "rlwinm 32".
|
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
ReplaceUses(SDValue(N, 0), N->getOperand(1));
|
2006-09-22 13:01:56 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-12-24 09:00:15 +08:00
|
|
|
// ISD::OR doesn't get all the bitfield insertion fun.
|
|
|
|
// (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
|
2010-12-24 12:28:06 +08:00
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2005-12-24 09:00:15 +08:00
|
|
|
N->getOperand(0).getOpcode() == ISD::OR &&
|
2006-06-27 08:04:13 +08:00
|
|
|
isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
|
2006-01-06 02:32:49 +08:00
|
|
|
unsigned MB, ME;
|
2005-12-24 09:00:15 +08:00
|
|
|
Imm = ~(Imm^Imm2);
|
|
|
|
if (isRunOfOnes(Imm, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2006-08-27 16:14:06 +08:00
|
|
|
N->getOperand(0).getOperand(1),
|
|
|
|
getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
|
2013-04-20 06:22:57 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
|
2005-12-24 09:00:15 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-09-30 07:33:31 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-18 15:30:46 +08:00
|
|
|
}
|
2005-08-19 08:38:14 +08:00
|
|
|
case ISD::OR:
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2006-08-16 07:48:22 +08:00
|
|
|
if (SDNode *I = SelectBitfieldInsert(N))
|
|
|
|
return I;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-09-30 07:33:31 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-19 07:38:00 +08:00
|
|
|
case ISD::SHL: {
|
|
|
|
unsigned Imm, SH, MB, ME;
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
|
2005-10-20 02:42:01 +08:00
|
|
|
isRotateAndMask(N, Imm, true, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2006-08-27 16:14:06 +08:00
|
|
|
getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
Woo, it kinda works. We now generate this atrociously bad, but correct,
code for long long foo(long long a, long long b) { return a + b; }
_foo:
or r2, r3, r3
or r3, r4, r4
or r4, r5, r5
or r5, r6, r6
rldicr r2, r2, 32, 31
rldicl r3, r3, 0, 32
rldicr r4, r4, 32, 31
rldicl r5, r5, 0, 32
or r2, r3, r2
or r3, r5, r4
add r4, r3, r2
rldicl r2, r4, 32, 32
or r4, r4, r4
or r3, r2, r2
blr
llvm-svn: 23809
2005-10-19 09:12:32 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-20 02:42:01 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-19 07:38:00 +08:00
|
|
|
}
|
|
|
|
case ISD::SRL: {
|
|
|
|
unsigned Imm, SH, MB, ME;
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
|
2010-12-24 12:28:06 +08:00
|
|
|
isRotateAndMask(N, Imm, true, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2006-08-27 16:14:06 +08:00
|
|
|
getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
|
Woo, it kinda works. We now generate this atrociously bad, but correct,
code for long long foo(long long a, long long b) { return a + b; }
_foo:
or r2, r3, r3
or r3, r4, r4
or r4, r5, r5
or r5, r6, r6
rldicr r2, r2, 32, 31
rldicl r3, r3, 0, 32
rldicr r4, r4, 32, 31
rldicl r5, r5, 0, 32
or r2, r3, r2
or r3, r5, r4
add r4, r3, r2
rldicl r2, r4, 32, 32
or r4, r4, r4
or r3, r2, r2
blr
llvm-svn: 23809
2005-10-19 09:12:32 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-20 02:42:01 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-19 07:38:00 +08:00
|
|
|
}
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// FIXME: Remove this once the ANDI glue bug is fixed:
|
|
|
|
case PPCISD::ANDIo_1_EQ_BIT:
|
|
|
|
case PPCISD::ANDIo_1_GT_BIT: {
|
|
|
|
if (!ANDIGlueBug)
|
|
|
|
break;
|
|
|
|
|
|
|
|
EVT InVT = N->getOperand(0).getValueType();
|
|
|
|
assert((InVT == MVT::i64 || InVT == MVT::i32) &&
|
|
|
|
"Invalid input type for ANDIo_1_EQ_BIT");
|
|
|
|
|
|
|
|
unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
|
|
|
|
SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
|
|
|
|
N->getOperand(0),
|
|
|
|
CurDAG->getTargetConstant(1, InVT)), 0);
|
|
|
|
SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
|
|
|
|
SDValue SRIdxVal =
|
|
|
|
CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
|
|
|
|
PPC::sub_eq : PPC::sub_gt, MVT::i32);
|
|
|
|
|
|
|
|
return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1,
|
|
|
|
CR0Reg, SRIdxVal,
|
|
|
|
SDValue(AndI.getNode(), 1) /* glue */);
|
|
|
|
}
|
2005-08-27 02:46:49 +08:00
|
|
|
case ISD::SELECT_CC: {
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
|
2011-06-20 23:28:39 +08:00
|
|
|
EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
|
|
|
|
bool isPPC64 = (PtrVT == MVT::i64);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// If this is a select of i1 operands, we'll pattern match it.
|
|
|
|
if (PPCSubTarget.useCRBits() &&
|
|
|
|
N->getOperand(0).getValueType() == MVT::i1)
|
|
|
|
break;
|
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
// Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc
|
2011-06-20 23:28:39 +08:00
|
|
|
if (!isPPC64)
|
|
|
|
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
|
|
|
|
if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
|
|
|
|
if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
|
|
|
|
if (N1C->isNullValue() && N3C->isNullValue() &&
|
|
|
|
N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
|
|
|
|
// FIXME: Implement this optzn for PPC64.
|
|
|
|
N->getValueType(0) == MVT::i32) {
|
|
|
|
SDNode *Tmp =
|
|
|
|
CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
|
|
|
N->getOperand(0), getI32Imm(~0U));
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
|
|
|
|
SDValue(Tmp, 0), N->getOperand(0),
|
|
|
|
SDValue(Tmp, 1));
|
|
|
|
}
|
2005-08-27 05:23:58 +08:00
|
|
|
|
2009-02-07 03:16:40 +08:00
|
|
|
SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
|
|
|
|
if (N->getValueType(0) == MVT::i1) {
|
|
|
|
// An i1 select is: (c & t) | (!c & f).
|
|
|
|
bool Inv;
|
|
|
|
unsigned Idx = getCRIdxForSetCC(CC, Inv);
|
|
|
|
|
|
|
|
unsigned SRI;
|
|
|
|
switch (Idx) {
|
|
|
|
default: llvm_unreachable("Invalid CC index");
|
|
|
|
case 0: SRI = PPC::sub_lt; break;
|
|
|
|
case 1: SRI = PPC::sub_gt; break;
|
|
|
|
case 2: SRI = PPC::sub_eq; break;
|
|
|
|
case 3: SRI = PPC::sub_un; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
|
|
|
|
|
|
|
|
SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
|
|
|
|
CCBit, CCBit), 0);
|
|
|
|
SDValue C = Inv ? NotCCBit : CCBit,
|
|
|
|
NotC = Inv ? CCBit : NotCCBit;
|
|
|
|
|
|
|
|
SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
|
|
|
|
C, N->getOperand(2)), 0);
|
|
|
|
SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
|
|
|
|
NotC, N->getOperand(3)), 0);
|
|
|
|
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
|
|
|
|
}
|
|
|
|
|
2006-11-18 06:10:59 +08:00
|
|
|
unsigned BROpc = getPredicateForSetCC(CC);
|
2005-08-27 05:23:58 +08:00
|
|
|
|
2005-10-01 09:35:02 +08:00
|
|
|
unsigned SelectCCOp;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2006-06-27 08:04:13 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_I4;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::i64)
|
2006-06-27 08:04:13 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_I8;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::f32)
|
2005-10-01 09:35:02 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_F4;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::f64)
|
2005-10-01 09:35:02 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_F8;
|
2006-04-09 06:45:08 +08:00
|
|
|
else
|
|
|
|
SelectCCOp = PPC::SELECT_CC_VRRC;
|
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
|
2006-08-27 16:14:06 +08:00
|
|
|
getI32Imm(BROpc) };
|
|
|
|
return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
|
2005-08-27 02:46:49 +08:00
|
|
|
}
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
case PPCISD::BDNZ:
|
|
|
|
case PPCISD::BDZ: {
|
|
|
|
bool IsPPC64 = PPCSubTarget.isPPC64();
|
|
|
|
SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
|
|
|
|
return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
|
|
|
|
(IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
|
|
|
|
(IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
|
|
|
|
MVT::Other, Ops, 2);
|
|
|
|
}
|
2006-11-18 06:37:34 +08:00
|
|
|
case PPCISD::COND_BRANCH: {
|
2008-11-06 01:16:24 +08:00
|
|
|
// Op #0 is the Chain.
|
2006-11-18 06:37:34 +08:00
|
|
|
// Op #1 is the PPC::PRED_* number.
|
|
|
|
// Op #2 is the CR#
|
|
|
|
// Op #3 is the Dest MBB
|
Eliminate the ISel priority queue, which used the topological order for a
priority function. Instead, just iterate over the AllNodes list, which is
already in topological order. This eliminates a fair amount of bookkeeping,
and speeds up the isel phase by about 15% on many testcases.
The impact on most targets is that AddToISelQueue calls can be simply removed.
In the x86 target, there are two additional notable changes.
The rule-bending AND+SHIFT optimization in MatchAddress that creates new
pre-isel nodes during isel is now a little more verbose, but more robust.
Instead of either creating an invalid DAG or creating an invalid topological
sort, as it has historically done, it can now just insert the new nodes into
the node list at a position where they will be consistent with the topological
ordering.
Also, the address-matching code has logic that checked to see if a node was
"already selected". However, when a node is selected, it has all its uses
taken away via ReplaceAllUsesWith or equivalent, so it won't receive any
further visits from MatchAddress. This code is now removed.
llvm-svn: 58748
2008-11-05 12:14:16 +08:00
|
|
|
// Op #4 is the Flag.
|
2007-06-29 09:25:06 +08:00
|
|
|
// Prevent PPC::PRED_* from being selected into LI.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Pred =
|
2008-09-13 00:56:44 +08:00
|
|
|
getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
|
2006-11-18 06:37:34 +08:00
|
|
|
N->getOperand(0), N->getOperand(4) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
|
2006-11-18 06:37:34 +08:00
|
|
|
}
|
2006-03-17 09:40:33 +08:00
|
|
|
case ISD::BR_CC: {
|
2005-08-22 02:50:37 +08:00
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
unsigned PCC = getPredicateForSetCC(CC);
|
|
|
|
|
|
|
|
if (N->getOperand(2).getValueType() == MVT::i1) {
|
|
|
|
unsigned Opc;
|
|
|
|
bool Swap;
|
|
|
|
switch (PCC) {
|
|
|
|
default: llvm_unreachable("Unexpected Boolean-operand predicate");
|
|
|
|
case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break;
|
|
|
|
case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break;
|
|
|
|
case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break;
|
|
|
|
case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break;
|
|
|
|
case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
|
|
|
|
case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
|
|
|
|
N->getOperand(Swap ? 3 : 2),
|
|
|
|
N->getOperand(Swap ? 2 : 3)), 0);
|
|
|
|
return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other,
|
|
|
|
BitComp, N->getOperand(4), N->getOperand(0));
|
|
|
|
}
|
|
|
|
|
2009-02-07 03:16:40 +08:00
|
|
|
SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
SDValue Ops[] = { getI32Imm(PCC), CondCode,
|
2006-08-27 16:14:06 +08:00
|
|
|
N->getOperand(4), N->getOperand(0) };
|
2009-08-12 04:47:22 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
2006-04-23 02:53:45 +08:00
|
|
|
case ISD::BRIND: {
|
2006-06-10 09:15:02 +08:00
|
|
|
// FIXME: Should custom lower this.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Chain = N->getOperand(0);
|
|
|
|
SDValue Target = N->getOperand(1);
|
2009-08-12 04:47:22 +08:00
|
|
|
unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
|
2011-06-03 23:47:49 +08:00
|
|
|
unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
|
2011-12-08 12:36:44 +08:00
|
|
|
Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
|
2009-09-26 02:54:59 +08:00
|
|
|
Chain), 0);
|
2011-06-03 23:47:49 +08:00
|
|
|
return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
|
2006-04-23 02:53:45 +08:00
|
|
|
}
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI-compatibility problem with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
case PPCISD::TOC_ENTRY: {
|
|
|
|
assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI");
|
|
|
|
|
2013-02-22 01:12:27 +08:00
|
|
|
// For medium and large code model, we generate two instructions as
|
|
|
|
// described below. Otherwise we allow SelectCodeCommon to handle this,
|
|
|
|
// selecting one of LDtoc, LDtocJTI, and LDtocCPT.
|
|
|
|
CodeModel::Model CModel = TM.getCodeModel();
|
|
|
|
if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI-compatibility problem with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
// The first source operand is a TargetGlobalAddress or a
|
|
|
|
// TargetJumpTable. If it is an externally defined symbol, a symbol
|
|
|
|
// with common linkage, a function address, or a jump table address,
|
2013-02-22 01:12:27 +08:00
|
|
|
// or if we are generating code for large code model, we generate:
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
|
|
|
|
// Otherwise we generate:
|
|
|
|
// ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
|
|
|
|
SDValue GA = N->getOperand(0);
|
|
|
|
SDValue TOCbase = N->getOperand(1);
|
|
|
|
SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
|
|
|
|
TOCbase, GA);
|
|
|
|
|
2013-02-22 01:12:27 +08:00
|
|
|
if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large)
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
|
|
|
|
SDValue(Tmp, 0));
|
|
|
|
|
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
|
|
|
|
const GlobalValue *GValue = G->getGlobal();
|
2013-01-08 03:29:18 +08:00
|
|
|
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
|
|
|
|
const GlobalValue *RealGValue = GAlias ?
|
|
|
|
GAlias->resolveAliasedGlobal(false) : GValue;
|
|
|
|
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
|
|
|
|
assert((GVar || isa<Function>(RealGValue)) &&
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
"Unexpected global value subclass!");
|
|
|
|
|
|
|
|
// An external variable is one without an initializer. For these,
|
|
|
|
// for variables with common linkage, and for Functions, generate
|
|
|
|
// the LDtocL form.
|
2013-01-08 03:29:18 +08:00
|
|
|
if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
|
|
|
|
RealGValue->hasAvailableExternallyLinkage())
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
|
|
|
|
SDValue(Tmp, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
|
|
|
|
SDValue(Tmp, 0), GA);
|
|
|
|
}
|
2013-02-20 23:50:31 +08:00
|
|
|
case PPCISD::VADD_SPLAT: {
|
2013-02-21 04:41:42 +08:00
|
|
|
// This expands into one of three sequences, depending on whether
|
|
|
|
// the first operand is odd or even, positive or negative.
|
2013-02-20 23:50:31 +08:00
|
|
|
assert(isa<ConstantSDNode>(N->getOperand(0)) &&
|
|
|
|
isa<ConstantSDNode>(N->getOperand(1)) &&
|
|
|
|
"Invalid operand on VADD_SPLAT!");
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
int Elt = N->getConstantOperandVal(0);
|
2013-02-20 23:50:31 +08:00
|
|
|
int EltSize = N->getConstantOperandVal(1);
|
2013-02-21 04:41:42 +08:00
|
|
|
unsigned Opc1, Opc2, Opc3;
|
2013-02-20 23:50:31 +08:00
|
|
|
EVT VT;
|
2013-02-21 04:41:42 +08:00
|
|
|
|
2013-02-20 23:50:31 +08:00
|
|
|
if (EltSize == 1) {
|
|
|
|
Opc1 = PPC::VSPLTISB;
|
|
|
|
Opc2 = PPC::VADDUBM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUBM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v16i8;
|
|
|
|
} else if (EltSize == 2) {
|
|
|
|
Opc1 = PPC::VSPLTISH;
|
|
|
|
Opc2 = PPC::VADDUHM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUHM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v8i16;
|
|
|
|
} else {
|
|
|
|
assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
|
|
|
|
Opc1 = PPC::VSPLTISW;
|
|
|
|
Opc2 = PPC::VADDUWM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUWM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v4i32;
|
|
|
|
}
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
if ((Elt & 1) == 0) {
|
|
|
|
// Elt is even, in the range [-32,-18] + [16,30].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp = VSPLTIS[BHW] elt
|
|
|
|
// VADDU[BHW]M tmp, tmp
|
|
|
|
// Where: [BHW] = B for size = 1, H for size = 2, W for size = 4
|
|
|
|
SDValue EltVal = getI32Imm(Elt >> 1);
|
|
|
|
SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
SDValue TmpVal = SDValue(Tmp, 0);
|
|
|
|
return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
|
|
|
|
|
|
|
|
} else if (Elt > 0) {
|
|
|
|
// Elt is odd and positive, in the range [17,31].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp1 = VSPLTIS[BHW] elt-16
|
|
|
|
// tmp2 = VSPLTIS[BHW] -16
|
|
|
|
// VSUBU[BHW]M tmp1, tmp2
|
|
|
|
SDValue EltVal = getI32Imm(Elt - 16);
|
|
|
|
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
EltVal = getI32Imm(-16);
|
|
|
|
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
|
|
|
|
SDValue(Tmp2, 0));
|
|
|
|
|
|
|
|
} else {
|
|
|
|
// Elt is odd and negative, in the range [-31,-17].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp1 = VSPLTIS[BHW] elt+16
|
|
|
|
// tmp2 = VSPLTIS[BHW] -16
|
|
|
|
// VADDU[BHW]M tmp1, tmp2
|
|
|
|
SDValue EltVal = getI32Imm(Elt + 16);
|
|
|
|
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
EltVal = getI32Imm(-16);
|
|
|
|
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
|
|
|
|
SDValue(Tmp2, 0));
|
|
|
|
}
|
2013-02-20 23:50:31 +08:00
|
|
|
}
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2010-01-05 09:24:18 +08:00
|
|
|
return SelectCode(N);
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
|
|
|
|
2014-01-03 06:09:39 +08:00
|
|
|
/// PostprocessISelDAG - Perform some late peephole optimizations
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
/// on the DAG representation.
|
|
|
|
void PPCDAGToDAGISel::PostprocessISelDAG() {
|
|
|
|
|
|
|
|
// Skip peepholes at -O0.
|
|
|
|
if (TM.getOptLevel() == CodeGenOpt::None)
|
|
|
|
return;
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
PeepholePPC64();
|
|
|
|
PeepholdCROps();
|
|
|
|
}
|
|
|
|
|
2014-02-28 14:11:16 +08:00
|
|
|
// Check if all users of this node will become isel where the second operand
|
|
|
|
// is the constant zero. If this is so, and if we can negate the condition,
|
|
|
|
// then we can flip the true and false operands. This will allow the zero to
|
|
|
|
// be folded with the isel so that we don't need to materialize a register
|
|
|
|
// containing zero.
|
|
|
|
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
|
|
|
|
// If we're not using isel, then this does not matter.
|
|
|
|
if (!PPCSubTarget.hasISEL())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
|
|
|
|
UI != UE; ++UI) {
|
|
|
|
SDNode *User = *UI;
|
|
|
|
if (!User->isMachineOpcode())
|
|
|
|
return false;
|
|
|
|
if (User->getMachineOpcode() != PPC::SELECT_I4 &&
|
|
|
|
User->getMachineOpcode() != PPC::SELECT_I8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDNode *Op2 = User->getOperand(2).getNode();
|
|
|
|
if (!Op2->isMachineOpcode())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (Op2->getMachineOpcode() != PPC::LI &&
|
|
|
|
Op2->getMachineOpcode() != PPC::LI8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0));
|
|
|
|
if (!C)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!C->isNullValue())
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Replace every user of N with a copy of itself in which operands 1 and 2 are
// exchanged. All users of N must be SELECT_I4 or SELECT_I8 machine nodes;
// this is asserted rather than checked.
void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
  // Gather the users first and rewrite them in a separate pass; presumably
  // this keeps the use-list walk from being disturbed by the nodes created
  // and replaced below.
  SmallVector<SDNode *, 4> ToReplace;
  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
            User->getMachineOpcode() == PPC::SELECT_I8) &&
           "Must have all select users");
    ToReplace.push_back(User);
  }

  for (unsigned Idx = 0, NumSelects = ToReplace.size(); Idx != NumSelects;
       ++Idx) {
    SDNode *OldSel = ToReplace[Idx];

    // Same opcode, location, result type and operand 0; only operands 1 and 2
    // trade places.
    SDValue Op0 = OldSel->getOperand(0);
    SDValue FormerOp1 = OldSel->getOperand(1);
    SDValue FormerOp2 = OldSel->getOperand(2);
    SDNode *NewSel =
      CurDAG->getMachineNode(OldSel->getMachineOpcode(), SDLoc(OldSel),
                             OldSel->getValueType(0), Op0, FormerOp2,
                             FormerOp1);

    DEBUG(dbgs() << "CR Peephole replacing:\nOld: ";
          OldSel->dump(CurDAG);
          dbgs() << "\nNew: ";
          NewSel->dump(CurDAG);
          dbgs() << "\n");

    ReplaceUses(OldSel, NewSel);
  }
}
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
void PPCDAGToDAGISel::PeepholdCROps() {
|
|
|
|
bool IsModified;
|
|
|
|
do {
|
|
|
|
IsModified = false;
|
|
|
|
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
|
|
|
|
E = CurDAG->allnodes_end(); I != E; ++I) {
|
|
|
|
MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
|
|
|
|
if (!MachineNode || MachineNode->use_empty())
|
|
|
|
continue;
|
|
|
|
SDNode *ResNode = MachineNode;
|
|
|
|
|
|
|
|
bool Op1Set = false, Op1Unset = false,
|
|
|
|
Op1Not = false,
|
|
|
|
Op2Set = false, Op2Unset = false,
|
|
|
|
Op2Not = false;
|
|
|
|
|
|
|
|
unsigned Opcode = MachineNode->getMachineOpcode();
|
|
|
|
switch (Opcode) {
|
|
|
|
default: break;
|
|
|
|
case PPC::CRAND:
|
|
|
|
case PPC::CRNAND:
|
|
|
|
case PPC::CROR:
|
|
|
|
case PPC::CRXOR:
|
|
|
|
case PPC::CRNOR:
|
|
|
|
case PPC::CREQV:
|
|
|
|
case PPC::CRANDC:
|
|
|
|
case PPC::CRORC: {
|
|
|
|
SDValue Op = MachineNode->getOperand(1);
|
|
|
|
if (Op.isMachineOpcode()) {
|
|
|
|
if (Op.getMachineOpcode() == PPC::CRSET)
|
|
|
|
Op2Set = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRUNSET)
|
|
|
|
Op2Unset = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRNOR &&
|
|
|
|
Op.getOperand(0) == Op.getOperand(1))
|
|
|
|
Op2Not = true;
|
|
|
|
}
|
|
|
|
} // fallthrough
|
|
|
|
case PPC::BC:
|
|
|
|
case PPC::BCn:
|
|
|
|
case PPC::SELECT_I4:
|
|
|
|
case PPC::SELECT_I8:
|
|
|
|
case PPC::SELECT_F4:
|
|
|
|
case PPC::SELECT_F8:
|
|
|
|
case PPC::SELECT_VRRC: {
|
|
|
|
SDValue Op = MachineNode->getOperand(0);
|
|
|
|
if (Op.isMachineOpcode()) {
|
|
|
|
if (Op.getMachineOpcode() == PPC::CRSET)
|
|
|
|
Op1Set = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRUNSET)
|
|
|
|
Op1Unset = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRNOR &&
|
|
|
|
Op.getOperand(0) == Op.getOperand(1))
|
|
|
|
Op1Not = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-28 14:11:16 +08:00
|
|
|
bool SelectSwap = false;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
switch (Opcode) {
|
|
|
|
default: break;
|
|
|
|
case PPC::CRAND:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// x & x = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Set)
|
|
|
|
// 1 & y = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Set)
|
|
|
|
// x & 1 = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Unset || Op2Unset)
|
|
|
|
// x & 0 = 0 & y = 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Not)
|
|
|
|
// ~x & y = andc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0));
|
|
|
|
else if (Op2Not)
|
|
|
|
// x & ~y = andc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRNAND:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// nand(x, x) -> nor(x, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Set)
|
|
|
|
// nand(1, y) -> nor(y, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Set)
|
|
|
|
// nand(x, 1) -> nor(x, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Unset || Op2Unset)
|
|
|
|
// nand(x, 0) = nand(0, y) = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Not)
|
|
|
|
// nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// nand(x, ~y) = ~x | y = orc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CROR:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// x | x = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Set || Op2Set)
|
|
|
|
// x | 1 = 1 | y = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Unset)
|
|
|
|
// 0 | y = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Unset)
|
|
|
|
// x | 0 = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Not)
|
|
|
|
// ~x | y = orc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0));
|
|
|
|
else if (Op2Not)
|
|
|
|
// x | ~y = orc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRXOR:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// xor(x, x) = 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Set)
|
|
|
|
// xor(1, y) -> nor(y, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Set)
|
|
|
|
// xor(x, 1) -> nor(x, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Unset)
|
|
|
|
// xor(0, y) = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Unset)
|
|
|
|
// xor(x, 0) = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Not)
|
|
|
|
// xor(~x, y) = eqv(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// xor(x, ~y) = eqv(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRNOR:
|
|
|
|
if (Op1Set || Op2Set)
|
|
|
|
// nor(1, y) -> 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Unset)
|
|
|
|
// nor(0, y) = ~y -> nor(y, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Unset)
|
|
|
|
// nor(x, 0) = ~x
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Not)
|
|
|
|
// nor(~x, y) = andc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// nor(x, ~y) = andc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CREQV:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// eqv(x, x) = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Set)
|
|
|
|
// eqv(1, y) = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Set)
|
|
|
|
// eqv(x, 1) = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Unset)
|
|
|
|
// eqv(0, y) = ~y -> nor(y, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Unset)
|
|
|
|
// eqv(x, 0) = ~x
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Not)
|
|
|
|
// eqv(~x, y) = xor(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// eqv(x, ~y) = xor(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRANDC:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// andc(x, x) = 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Set)
|
|
|
|
// andc(1, y) = ~y
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op1Unset || Op2Set)
|
|
|
|
// andc(0, y) = andc(x, 1) = 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op2Unset)
|
|
|
|
// andc(x, 0) = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Not)
|
|
|
|
// andc(~x, y) = ~(x | y) = nor(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// andc(x, ~y) = x & y
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRORC:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// orc(x, x) = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Set || Op2Unset)
|
|
|
|
// orc(1, y) = orc(x, 0) = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op2Set)
|
|
|
|
// orc(x, 1) = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Unset)
|
|
|
|
// orc(0, y) = ~y
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op1Not)
|
|
|
|
// orc(~x, y) = ~(x & y) = nand(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// orc(x, ~y) = x | y
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2014-02-28 14:11:16 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode))
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0)),
|
|
|
|
SelectSwap = true;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::SELECT_I4:
|
|
|
|
case PPC::SELECT_I8:
|
|
|
|
case PPC::SELECT_F4:
|
|
|
|
case PPC::SELECT_F8:
|
|
|
|
case PPC::SELECT_VRRC:
|
|
|
|
if (Op1Set)
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op1Unset)
|
|
|
|
ResNode = MachineNode->getOperand(2).getNode();
|
|
|
|
else if (Op1Not)
|
|
|
|
ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
|
|
|
|
SDLoc(MachineNode),
|
|
|
|
MachineNode->getValueType(0),
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(2),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
break;
|
|
|
|
case PPC::BC:
|
|
|
|
case PPC::BCn:
|
|
|
|
if (Op1Not)
|
|
|
|
ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
|
|
|
|
PPC::BC,
|
|
|
|
SDLoc(MachineNode),
|
|
|
|
MVT::Other,
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(2));
|
|
|
|
// FIXME: Handle Op1Set, Op1Unset here too.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-28 14:11:16 +08:00
|
|
|
// If we're inverting this node because it is used only by selects that
|
|
|
|
// we'd like to swap, then swap the selects before the node replacement.
|
|
|
|
if (SelectSwap)
|
|
|
|
SwapAllSelectUsers(MachineNode);
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
if (ResNode != MachineNode) {
|
|
|
|
DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
|
|
|
|
DEBUG(MachineNode->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\nNew: ");
|
|
|
|
DEBUG(ResNode->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\n");
|
|
|
|
|
|
|
|
ReplaceUses(MachineNode, ResNode);
|
|
|
|
IsModified = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (IsModified)
|
|
|
|
CurDAG->RemoveDeadNodes();
|
|
|
|
} while (IsModified);
|
|
|
|
}
|
|
|
|
|
|
|
|
void PPCDAGToDAGISel::PeepholePPC64() {
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
// These optimizations are currently supported only for 64-bit SVR4.
|
|
|
|
if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64())
|
|
|
|
return;
|
|
|
|
|
|
|
|
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
|
|
|
|
++Position;
|
|
|
|
|
|
|
|
while (Position != CurDAG->allnodes_begin()) {
|
|
|
|
SDNode *N = --Position;
|
|
|
|
// Skip dead nodes and any non-machine opcodes.
|
|
|
|
if (N->use_empty() || !N->isMachineOpcode())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
unsigned FirstOp;
|
|
|
|
unsigned StorageOpcode = N->getMachineOpcode();
|
|
|
|
|
|
|
|
switch (StorageOpcode) {
|
|
|
|
default: continue;
|
|
|
|
|
|
|
|
case PPC::LBZ:
|
|
|
|
case PPC::LBZ8:
|
|
|
|
case PPC::LD:
|
|
|
|
case PPC::LFD:
|
|
|
|
case PPC::LFS:
|
|
|
|
case PPC::LHA:
|
|
|
|
case PPC::LHA8:
|
|
|
|
case PPC::LHZ:
|
|
|
|
case PPC::LHZ8:
|
|
|
|
case PPC::LWA:
|
|
|
|
case PPC::LWZ:
|
|
|
|
case PPC::LWZ8:
|
|
|
|
FirstOp = 0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case PPC::STB:
|
|
|
|
case PPC::STB8:
|
|
|
|
case PPC::STD:
|
|
|
|
case PPC::STFD:
|
|
|
|
case PPC::STFS:
|
|
|
|
case PPC::STH:
|
|
|
|
case PPC::STH8:
|
|
|
|
case PPC::STW:
|
|
|
|
case PPC::STW8:
|
|
|
|
FirstOp = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If this is a load or store with a zero offset, we may be able to
|
|
|
|
// fold an add-immediate into the memory operation.
|
|
|
|
if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
|
|
|
|
N->getConstantOperandVal(FirstOp) != 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SDValue Base = N->getOperand(FirstOp + 1);
|
|
|
|
if (!Base.isMachineOpcode())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
unsigned Flags = 0;
|
|
|
|
bool ReplaceFlags = true;
|
|
|
|
|
|
|
|
// When the feeding operation is an add-immediate of some sort,
|
|
|
|
// determine whether we need to add relocation information to the
|
|
|
|
// target flags on the immediate operand when we fold it into the
|
|
|
|
// load instruction.
|
|
|
|
//
|
|
|
|
// For something like ADDItocL, the relocation information is
|
|
|
|
// inferred from the opcode; when we process it in the AsmPrinter,
|
|
|
|
// we add the necessary relocation there. A load, though, can receive
|
|
|
|
// relocation from various flavors of ADDIxxx, so we need to carry
|
|
|
|
// the relocation information in the target flags.
|
|
|
|
switch (Base.getMachineOpcode()) {
|
|
|
|
default: continue;
|
|
|
|
|
|
|
|
case PPC::ADDI8:
|
2013-03-26 18:55:20 +08:00
|
|
|
case PPC::ADDI:
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
// In some cases (such as TLS) the relocation information
|
|
|
|
// is already in place on the operand, so copying the operand
|
|
|
|
// is sufficient.
|
|
|
|
ReplaceFlags = false;
|
|
|
|
// For these cases, the immediate may not be divisible by 4, in
|
|
|
|
// which case the fold is illegal for DS-form instructions. (The
|
|
|
|
// other cases provide aligned addresses and are always safe.)
|
|
|
|
if ((StorageOpcode == PPC::LWA ||
|
|
|
|
StorageOpcode == PPC::LD ||
|
|
|
|
StorageOpcode == PPC::STD) &&
|
|
|
|
(!isa<ConstantSDNode>(Base.getOperand(1)) ||
|
|
|
|
Base.getConstantOperandVal(1) % 4 != 0))
|
|
|
|
continue;
|
|
|
|
break;
|
|
|
|
case PPC::ADDIdtprelL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_DTPREL_LO;
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
case PPC::ADDItlsldL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_TLSLD_LO;
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
case PPC::ADDItocL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_TOC_LO;
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We found an opportunity. Reverse the operands from the add
|
|
|
|
// immediate and substitute them into the load or store. If
|
|
|
|
// needed, update the target flags for the immediate operand to
|
|
|
|
// reflect the necessary relocation information.
|
|
|
|
DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
|
|
|
|
DEBUG(Base->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\nN: ");
|
|
|
|
DEBUG(N->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\n");
|
|
|
|
|
|
|
|
SDValue ImmOpnd = Base.getOperand(1);
|
|
|
|
|
|
|
|
// If the relocation information isn't already present on the
|
|
|
|
// immediate operand, add it now.
|
|
|
|
if (ReplaceFlags) {
|
2013-02-21 22:35:42 +08:00
|
|
|
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(GA);
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, the instruction must be
deleted only when its last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
const GlobalValue *GV = GA->getGlobal();
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
|
|
|
// We can't perform this optimization for data whose alignment
|
|
|
|
// is insufficient for the instruction encoding.
|
|
|
|
if (GV->getAlignment() < 4 &&
|
|
|
|
(StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
|
|
|
|
StorageOpcode == PPC::LWA)) {
|
|
|
|
DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
|
|
|
|
continue;
|
|
|
|
}
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, the instruction must be
deleted only when its last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
|
2013-02-22 01:26:05 +08:00
|
|
|
} else if (ConstantPoolSDNode *CP =
|
|
|
|
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
|
2013-02-21 22:35:42 +08:00
|
|
|
const Constant *C = CP->getConstVal();
|
|
|
|
ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
|
|
|
|
CP->getAlignment(),
|
|
|
|
0, Flags);
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, the instruction must be
deleted only when its last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (FirstOp == 1) // Store
|
|
|
|
(void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
|
|
|
|
Base.getOperand(0), N->getOperand(3));
|
|
|
|
else // Load
|
|
|
|
(void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
|
|
|
|
N->getOperand(2));
|
|
|
|
|
|
|
|
// The add-immediate may now be dead, in which case remove it.
|
|
|
|
if (Base.getNode()->use_empty())
|
|
|
|
CurDAG->RemoveDeadNode(Base.getNode());
|
|
|
|
}
|
|
|
|
}
|
2005-08-18 03:33:03 +08:00
|
|
|
|
2006-06-10 09:15:02 +08:00
|
|
|
|
2010-12-24 12:28:06 +08:00
|
|
|
/// createPPCISelDag - This pass converts a legalized DAG into a
|
2005-08-18 03:33:03 +08:00
|
|
|
/// PowerPC-specific DAG, ready for instruction scheduling.
|
|
|
|
///
|
2006-03-14 07:20:37 +08:00
|
|
|
FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
|
2005-10-18 08:28:58 +08:00
|
|
|
return new PPCDAGToDAGISel(TM);
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
|
|
|
|
2013-02-14 01:40:07 +08:00
|
|
|
/// initializePassOnce - Build the PassInfo describing the PowerPC DAG->DAG
/// instruction selector (argument string "ppc-codegen", keyed on
/// SelectionDAGISel::ID) and register it with \p Registry.
///
/// NOTE(review): the trailing `0, false, false` are presumably the
/// pass-constructor slot and the CFG-only / is-analysis flags of PassInfo, and
/// the `true` given to registerPass presumably lets the registry free the
/// PassInfo -- confirm against the PassInfo / PassRegistry declarations.
static void initializePassOnce(PassRegistry &Registry) {
  PassInfo *Info =
      new PassInfo("PowerPC DAG->DAG Pattern Instruction Selection",
                   "ppc-codegen", &SelectionDAGISel::ID, 0, false, false);
  Registry.registerPass(*Info, true);
}
|
|
|
|
|
|
|
|
/// initializePPCDAGToDAGISelPass - Public entry point for registering the
/// PowerPC DAG->DAG instruction selection pass.  It hands initializePassOnce
/// to the CALL_ONCE_INITIALIZATION macro, which receives only that function
/// name.
///
/// NOTE(review): \p Registry is not named at the call site, so the macro
/// presumably picks it up by name and runs initializePassOnce a single time
/// -- confirm against the macro's definition.
void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) {
|
|
|
|
CALL_ONCE_INITIALIZATION(initializePassOnce);
|
|
|
|
}
|
|
|
|
|