//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a pattern matching instruction selector for PowerPC,
// converting from a legalized dag to a PPC dag.
//
//===----------------------------------------------------------------------===//

#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "ppc-codegen"

// FIXME: Remove this once the bug has been fixed!
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
                          cl::desc("expose the ANDI glue bug on PPC"),
                          cl::Hidden);

static cl::opt<bool>
    UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
                       cl::desc("use aggressive ppc isel for bit permutations"),
                       cl::Hidden);
static cl::opt<bool> BPermRewriterNoMasking(
    "ppc-bit-perm-rewriter-stress-rotates",
    cl::desc("stress rotate selection in aggressive ppc isel for "
             "bit permutations"),
    cl::Hidden);

static cl::opt<bool> EnableBranchHint(
    "ppc-use-branch-hint", cl::init(true),
    cl::desc("Enable static hinting of branches on ppc"),
    cl::Hidden);

namespace {
  //===--------------------------------------------------------------------===//
  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
  /// instructions for SelectionDAG operations.
  ///
  class PPCDAGToDAGISel : public SelectionDAGISel {
    const PPCTargetMachine &TM;
    const PPCSubtarget *PPCSubTarget;
    const PPCTargetLowering *PPCLowering;
    unsigned GlobalBaseReg;
  public:
    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
        : SelectionDAGISel(tm), TM(tm) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Make sure we re-emit a set of the global base reg if necessary
      GlobalBaseReg = 0;
      PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
      PPCLowering = PPCSubTarget->getTargetLowering();
      SelectionDAGISel::runOnMachineFunction(MF);

      if (!PPCSubTarget->isSVR4ABI())
        InsertVRSaveCode(MF);

      return true;
    }

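    // PreprocessISelDAG and PostprocessISelDAG are the SelectionDAGISel hooks
    // that run PPC-specific DAG transforms immediately before and after
    // instruction selection (e.g. the post-selection peepholes declared below).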
void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

    /// getI32Imm - Return a target constant with the specified value, of type
    /// i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
    }

    /// getI64Imm - Return a target constant with the specified value, of type
    /// i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &dl) {
      return CurDAG->getTargetConstant(Imm, dl, MVT::i64);
    }

    /// getSmallIPtrImm - Return a target constant of pointer type.
    inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) {
      return CurDAG->getTargetConstant(
          Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout()));
    }

    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
    /// rotate and mask opcode and mask operation.
    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
                                unsigned &SH, unsigned &MB, unsigned &ME);

    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
    /// base register.  Return the virtual register that holds this value.
    SDNode *getGlobalBaseReg();

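    /// selectFrameIndex - Select SN as an ADDI/ADDI8 of the TargetFrameIndex
    /// for N plus a small constant Offset (see the definition below).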
void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);

    // Select - Convert the specified operand from a target-independent to a
    // target-specific node if it hasn't already been changed.
    void Select(SDNode *N) override;

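    /// tryBitfieldInsert/tryBitPermutation - Attempt to select N (typically an
    /// OR of masked or shifted values) as a single rlwimi or as a
    /// rotate-and-mask based bit-permutation sequence; return true if N was
    /// replaced.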
bool tryBitfieldInsert(SDNode *N);
    bool tryBitPermutation(SDNode *N);

    /// SelectCC - Select a comparison of the specified values with the
    /// specified condition code, returning the CR# of the expression.
    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                     const SDLoc &dl);

    /// SelectAddrImm - Returns true if the address N can be represented by
    /// a base register plus a signed 16-bit displacement [r+imm].
    bool SelectAddrImm(SDValue N, SDValue &Disp,
                       SDValue &Base) {
      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
    }

    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
    /// immediate field.  Note that the operand at this point is already the
    /// result of a prior SelectAddressRegImm call.
    bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
      if (N.getOpcode() == ISD::TargetConstant ||
          N.getOpcode() == ISD::TargetGlobalAddress) {
        Out = N;
        return true;
      }

      return false;
    }

    /// SelectAddrIdx - Given the specified address, check to see if it can be
    /// represented as an indexed [r+r] operation.  Returns false if it can
    /// be represented by [r+imm], which is preferred.
    bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
      return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
    }

    /// SelectAddrIdxOnly - Given the specified address, force it to be
    /// represented as an indexed [r+r] operation.
    bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
      return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
    }

    /// SelectAddrImmX4 - Returns true if the address N can be represented by
    /// a base register plus a signed 16-bit displacement that is a multiple of 4.
    /// Suitable for use by STD and friends.
    bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
    }

    // Select an address into a single register.
    bool SelectAddr(SDValue N, SDValue &Base) {
      Base = N;
      return true;
    }

    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
    /// inline asm expressions.  It is always correct to compute the value into
    /// a register.  The case of adding a (possibly relocatable) constant to a
    /// register can be improved, but it is wrong to substitute Reg+Reg for
    /// Reg in an asm, because the load or store opcode would have to change.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override {
      switch(ConstraintID) {
      default:
        errs() << "ConstraintID: " << ConstraintID << "\n";
        llvm_unreachable("Unexpected asm memory constraint");
      case InlineAsm::Constraint_es:
      case InlineAsm::Constraint_i:
      case InlineAsm::Constraint_m:
      case InlineAsm::Constraint_o:
      case InlineAsm::Constraint_Q:
      case InlineAsm::Constraint_Z:
      case InlineAsm::Constraint_Zy:
        // We need to make sure that this one operand does not end up in r0
        // (because we might end up lowering this as 0(%op)).
        const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
        const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
        SDLoc dl(Op);
        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
        SDValue NewOp =
          SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                         dl, Op.getValueType(),
                                         Op, RC), 0);

        OutOps.push_back(NewOp);
        return false;
      }
      return true;
    }

    void InsertVRSaveCode(MachineFunction &MF);

    const char *getPassName() const override {
      return "PowerPC DAG->DAG Pattern Instruction Selection";
    }

    // Include the pieces autogenerated from the target description.
#include "PPCGenDAGISel.inc"

private:
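    // trySETCC selects SETCC nodes directly; the Peephole* routines below run
    // from PostprocessISelDAG to clean up the selected DAG (CR-bit logical
    // operations, PPC64 TOC-based addressing, and redundant zero extensions).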
bool trySETCC(SDNode *N);

    void PeepholePPC64();
    void PeepholePPC64ZExt();
    void PeepholeCROps();

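    // PreprocessISelDAG helpers: combineToCMPB tries to match byte-wise
    // compare/merge patterns to the CMPB instruction, and foldBoolExts folds
    // operations on extensions of i1 values into SELECT_I[48] operands.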
SDValue combineToCMPB(SDNode *N);
    void foldBoolExts(SDValue &Res, SDNode *&N);

    bool AllUsersSelectZero(SDNode *N);
    void SwapAllSelectUsers(SDNode *N);

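    // transferMemOperands copies the memory operands (MMOs) from N to the
    // newly selected Result node so that memory information (e.g. for TOC
    // loads) is preserved through selection.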
void transferMemOperands(SDNode *N, SDNode *Result);
  };
}

/// InsertVRSaveCode - Once the entire function has been instruction selected,
/// all virtual registers are created and all machine instructions are built,
/// check to see if we need to save/restore VRSAVE.  If so, do it.
void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
  // Check to see if this function uses vector registers, which means we have to
  // save and restore the VRSAVE register and update it with the regs we use.
  //
  // In this case, there will be virtual registers of vector type created
  // by the scheduler.  Detect them now.
  bool HasVectorVReg = false;
  for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
    if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
      HasVectorVReg = true;
      break;
    }
  }
  if (!HasVectorVReg) return;  // nothing to do.

  // If we have a vector register, we want to emit code into the entry and exit
  // blocks to save and restore the VRSAVE register.  We do this here (instead
  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
  //
  // 1. This (trivially) reduces the load on the register allocator, by not
  //    having to represent the live range of the VRSAVE register.
  // 2. This (more significantly) allows us to create a temporary virtual
  //    register to hold the saved VRSAVE value, allowing this temporary to be
  //    register allocated, instead of forcing it to be spilled to the stack.

  // Create two vregs - one to hold the VRSAVE register that is live-in to the
  // function and one for the value after having bits or'd into it.
  unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
  unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);

  const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
  MachineBasicBlock &EntryBB = *Fn.begin();
  DebugLoc dl;
  // Emit the following code into the entry block:
  // InVRSAVE = MFVRSAVE
  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
  // MTVRSAVE UpdatedVRSAVE
  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
  BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
  BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
          UpdatedVRSAVE).addReg(InVRSAVE);
  BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);

  // Find all return blocks, outputting a restore in each epilog.
  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
    if (BB->isReturnBlock()) {
      IP = BB->end(); --IP;

      // Skip over all terminator instructions, which are part of the return
      // sequence.
      MachineBasicBlock::iterator I2 = IP;
      while (I2 != BB->begin() && (--I2)->isTerminator())
        IP = I2;

      // Emit: MTVRSAVE InVRSave
      BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
    }
  }
}

/// getGlobalBaseReg - Output the instructions required to put the
/// base address to use for accessing globals into a register.
///
SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
  if (!GlobalBaseReg) {
    const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
    // Insert the set of GlobalBaseReg into the first MBB of the function
    MachineBasicBlock &FirstMBB = MF->front();
    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
    const Module *M = MF->getFunction()->getParent();
    DebugLoc dl;

    if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
      if (PPCSubTarget->isTargetELF()) {
        GlobalBaseReg = PPC::R30;
        if (M->getPICLevel() == PICLevel::SmallPIC) {
          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
        } else {
          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
          unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
          BuildMI(FirstMBB, MBBI, dl,
                  TII.get(PPC::UpdateGBR), GlobalBaseReg)
              .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
        }
      } else {
        GlobalBaseReg =
            RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
      }
    } else {
      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass);
      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
    }
  }
  return CurDAG->getRegister(GlobalBaseReg,
                             PPCLowering->getPointerTy(CurDAG->getDataLayout()))
      .getNode();
}

/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value.  If so, this returns true and the
/// immediate.
static bool isIntS16Immediate(SDNode *N, short &Imm) {
  if (N->getOpcode() != ISD::Constant)
    return false;

  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
  if (N->getValueType(0) == MVT::i32)
    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
  else
    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}

static bool isIntS16Immediate(SDValue Op, short &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}

/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
    Imm = cast<ConstantSDNode>(N)->getZExtValue();
    return true;
  }
  return false;
}

/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
/// operand. If so Imm will receive the 64-bit value.
static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
    Imm = cast<ConstantSDNode>(N)->getZExtValue();
    return true;
  }
  return false;
}

// isInt32Immediate - This method tests to see if the node is a 32-bit constant
// operand. If so, Imm will receive the 32-bit value.
static bool isInt32Immediate(SDValue N, unsigned &Imm) {
  return isInt32Immediate(N.getNode(), Imm);
}

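/// getBranchHint - Map the statically-estimated branch probabilities for the
/// conditional branch to DestMBB onto a PPC branch hint: BR_TAKEN_HINT,
/// BR_NONTAKEN_HINT, or BR_NO_HINT when no strong prediction is available.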
static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
                              const SDValue &DestMBB) {
  assert(isa<BasicBlockSDNode>(DestMBB));

  if (!FuncInfo->BPI) return PPC::BR_NO_HINT;

  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const TerminatorInst *BBTerm = BB->getTerminator();

  if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;

  const BasicBlock *TBB = BBTerm->getSuccessor(0);
  const BasicBlock *FBB = BBTerm->getSuccessor(1);

  auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB);
  auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB);

  // We only want to handle cases that are easy to predict statically, e.g. a
  // C++ throw statement that is very unlikely to be taken, or a call to a
  // function that never returns (such as stdlib exit()).  Threshold filters
  // out the other cases.
  //
  // Below is the LLVM branch weight table; we only want to handle cases 1
  // and 2:
  //
  // Case                  Taken:Nontaken  Example
  // 1. Unreachable        1048575:1       C++ throw, stdlib exit()
  // 2. Invoke-terminating 1:1048575
  // 3. Coldblock          4:64            __builtin_expect
  // 4. Loop Branch        124:4           for loop
  // 5. PH/ZH/FPH          20:12
  const uint32_t Threshold = 10000;

  if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
    return PPC::BR_NO_HINT;

  DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
               << BB->getName() << "'\n"
               << " -> " << TBB->getName() << ": " << TProb << "\n"
               << " -> " << FBB->getName() << ": " << FProb << "\n");

  const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);

  // If the destination basic block is the false successor (FBB), swap the
  // probabilities so that TProb always refers to the branch probability of
  // reaching the destination block.
  if (BBDN->getBasicBlock()->getBasicBlock() != TBB)
    std::swap(TProb, FProb);

  return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT;
}

// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so Imm will receive the 32 bit value.
static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
  return N->getOpcode() == Opc
         && isInt32Immediate(N->getOperand(1).getNode(), Imm);
}

void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
  SDLoc dl(SN);
  int FI = cast<FrameIndexSDNode>(N)->getIndex();
  SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
  unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
  if (SN->hasOneUse())
    CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
                         getSmallIPtrImm(Offset, dl));
  else
    ReplaceNode(SN, CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
                                           getSmallIPtrImm(Offset, dl)));
}

bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
                                      bool isShiftMask, unsigned &SH,
                                      unsigned &MB, unsigned &ME) {
  // Don't even go down this path for i64, since different logic will be
  // necessary for rldicl/rldicr/rldimi.
  if (N->getValueType(0) != MVT::i32)
    return false;

  unsigned Shift = 32;
  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
  unsigned Opcode = N->getOpcode();
  if (N->getNumOperands() != 2 ||
      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
    return false;

  if (Opcode == ISD::SHL) {
    // apply shift left to mask if it comes first
    if (isShiftMask) Mask = Mask << Shift;
    // determine which bits are made indeterminant by shift
    Indeterminant = ~(0xFFFFFFFFu << Shift);
  } else if (Opcode == ISD::SRL) {
    // apply shift right to mask if it comes first
    if (isShiftMask) Mask = Mask >> Shift;
    // determine which bits are made indeterminant by shift
    Indeterminant = ~(0xFFFFFFFFu >> Shift);
    // adjust for the left rotate
    Shift = 32 - Shift;
  } else if (Opcode == ISD::ROTL) {
    Indeterminant = 0;
  } else {
    return false;
  }

  // if the mask doesn't intersect any Indeterminant bits
  if (Mask && !(Mask & Indeterminant)) {
    SH = Shift & 31;
    // make sure the mask is still a mask (wrap arounds may not be)
    return isRunOfOnes(Mask, MB, ME);
  }
  return false;
}

/// Turn an or of two masked values into the rotate left word immediate then
/// mask insert (rlwimi) instruction.
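/// For example, (or (and x, 0xFFFF0000), (and (srl y, 16), 0xFFFF)) can be
/// selected as a single "rlwimi x, y, 16, 16, 31".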
bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc dl(N);

  APInt LKZ, LKO, RKZ, RKO;
  CurDAG->computeKnownBits(Op0, LKZ, LKO);
  CurDAG->computeKnownBits(Op1, RKZ, RKO);

  unsigned TargetMask = LKZ.getZExtValue();
  unsigned InsertMask = RKZ.getZExtValue();

  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
    unsigned Op0Opc = Op0.getOpcode();
    unsigned Op1Opc = Op1.getOpcode();
    unsigned Value, SH = 0;
    TargetMask = ~TargetMask;
    InsertMask = ~InsertMask;

    // If the LHS has a foldable shift and the RHS does not, then swap it to the
    // RHS so that we can fold the shift into the insert.
    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
          Op0.getOperand(0).getOpcode() == ISD::SRL) {
        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
            Op1.getOperand(0).getOpcode() != ISD::SRL) {
          std::swap(Op0, Op1);
          std::swap(Op0Opc, Op1Opc);
          std::swap(TargetMask, InsertMask);
        }
      }
    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
          Op1.getOperand(0).getOpcode() != ISD::SRL) {
        std::swap(Op0, Op1);
        std::swap(Op0Opc, Op1Opc);
        std::swap(TargetMask, InsertMask);
      }
    }

    unsigned MB, ME;
    if (isRunOfOnes(InsertMask, MB, ME)) {
      SDValue Tmp1, Tmp2;

      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
          isInt32Immediate(Op1.getOperand(1), Value)) {
        Op1 = Op1.getOperand(0);
        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
      }
      if (Op1Opc == ISD::AND) {
        // The AND mask might not be a constant, and we need to make sure that
        // if we're going to fold the masking with the insert, all bits not
        // known to be zero in the mask are known to be one.
        APInt MKZ, MKO;
        CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
        bool CanFoldMask = InsertMask == MKO.getZExtValue();

        unsigned SHOpc = Op1.getOperand(0).getOpcode();
        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
          // Note that Value must be in range here (less than 32) because
          // otherwise there would not be any bits set in InsertMask.
          Op1 = Op1.getOperand(0).getOperand(0);
          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
        }
      }

      SH &= 31;
      SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl),
                        getI32Imm(ME, dl) };
      ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
      return true;
    }
  }
  return false;
}

// Predict the number of instructions that would be generated by calling
|
2016-05-21 05:43:23 +08:00
|
|
|
// getInt64Direct (below) for this immediate.
|
|
|
|
static unsigned getInt64CountDirect(int64_t Imm) {
|
2015-01-01 10:53:29 +08:00
|
|
|
// Assume no remaining bits.
|
|
|
|
unsigned Remainder = 0;
|
|
|
|
// Assume no shift required.
|
|
|
|
unsigned Shift = 0;
|
|
|
|
|
|
|
|
// If it can't be represented as a 32 bit value.
|
|
|
|
if (!isInt<32>(Imm)) {
|
|
|
|
Shift = countTrailingZeros<uint64_t>(Imm);
|
|
|
|
int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
|
|
|
|
|
|
|
|
// If the shifted value fits 32 bits.
|
|
|
|
if (isInt<32>(ImmSh)) {
|
|
|
|
// Go with the shifted value.
|
|
|
|
Imm = ImmSh;
|
|
|
|
} else {
|
|
|
|
// Still stuck with a 64 bit value.
|
|
|
|
Remainder = Imm;
|
|
|
|
Shift = 32;
|
|
|
|
Imm >>= 32;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Intermediate operand.
|
|
|
|
unsigned Result = 0;
|
|
|
|
|
|
|
|
// Handle first 32 bits.
|
|
|
|
unsigned Lo = Imm & 0xFFFF;
|
|
|
|
|
|
|
|
// Simple value.
|
|
|
|
if (isInt<16>(Imm)) {
|
|
|
|
// Just the Lo bits.
|
|
|
|
++Result;
|
|
|
|
} else if (Lo) {
|
|
|
|
// Handle the Hi bits and Lo bits.
|
|
|
|
Result += 2;
|
|
|
|
} else {
|
|
|
|
// Just the Hi bits.
|
|
|
|
++Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If no shift, we're done.
|
|
|
|
if (!Shift) return Result;
|
|
|
|
|
|
|
|
// Shift for the next step if the upper 32 bits were not zero.
|
|
|
|
if (Imm)
|
|
|
|
++Result;
|
|
|
|
|
|
|
|
// Add in the last bits as required.
|
2015-11-10 20:29:37 +08:00
|
|
|
if ((Remainder >> 16) & 0xFFFF)
|
2015-01-01 10:53:29 +08:00
|
|
|
++Result;
|
2015-11-10 20:29:37 +08:00
|
|
|
if (Remainder & 0xFFFF)
|
2015-01-01 10:53:29 +08:00
|
|
|
++Result;
|
|
|
|
|
|
|
|
return Result;
|
|
|
|
}
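// Illustrative walk-through: Imm = 0x123456789ABCDEF0 does not fit in 32 bits,
// and shifting out its 4 trailing zeros does not help, so Shift becomes 32 and
// the low word is left in Remainder. The high word 0x12345678 then costs 2
// (lis + ori), the shift costs 1 (rldicr), and the two non-zero halfwords of
// the remainder cost 2 more (oris + ori): 5 instructions in total.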
|
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
static uint64_t Rot64(uint64_t Imm, unsigned R) {
|
|
|
|
return (Imm << R) | (Imm >> (64 - R));
|
|
|
|
}
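// Note (assumption based on the loops below): R is always in [1, 62] here, so
// the (64 - R) shift never degenerates into the undefined shift-by-64 that
// R == 0 would produce.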
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
static unsigned getInt64Count(int64_t Imm) {
|
|
|
|
unsigned Count = getInt64CountDirect(Imm);
|
2015-01-05 11:41:38 +08:00
|
|
|
if (Count == 1)
|
|
|
|
return Count;
|
2015-01-04 20:35:03 +08:00
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
for (unsigned r = 1; r < 63; ++r) {
|
2015-01-05 11:41:38 +08:00
|
|
|
uint64_t RImm = Rot64(Imm, r);
|
2016-05-21 05:43:23 +08:00
|
|
|
unsigned RCount = getInt64CountDirect(RImm) + 1;
|
2015-01-05 11:41:38 +08:00
|
|
|
Count = std::min(Count, RCount);
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
// See comments in getInt64 for an explanation of the logic below.
|
2015-01-05 11:41:38 +08:00
|
|
|
unsigned LS = findLastSet(RImm);
|
|
|
|
if (LS != r-1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
|
|
|
|
uint64_t RImmWithOnes = RImm | OnesMask;
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
RCount = getInt64CountDirect(RImmWithOnes) + 1;
|
2015-01-04 23:43:55 +08:00
|
|
|
Count = std::min(Count, RCount);
|
|
|
|
}
|
2015-01-04 20:35:03 +08:00
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
return Count;
|
2015-01-04 20:35:03 +08:00
|
|
|
}
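// Illustrative case: Imm = 0xF000000000000001 costs 3 instructions directly
// (lis, rldicr, ori), but rotating it left by 4 yields 0x1F, which a single li
// can materialize, so the loop above settles on a count of 2 (li plus the
// rotate that getInt64 will emit).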
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
// Select a 64-bit constant. For cost-modeling purposes, getInt64CountDirect
|
2015-01-01 10:53:29 +08:00
|
|
|
// (above) needs to be kept in sync with this function.
|
2016-06-12 23:39:02 +08:00
|
|
|
static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
|
|
|
|
int64_t Imm) {
|
2015-01-01 10:53:29 +08:00
|
|
|
// Assume no remaining bits.
|
|
|
|
unsigned Remainder = 0;
|
|
|
|
// Assume no shift required.
|
|
|
|
unsigned Shift = 0;
|
|
|
|
|
|
|
|
// If it can't be represented as a 32 bit value.
|
|
|
|
if (!isInt<32>(Imm)) {
|
|
|
|
Shift = countTrailingZeros<uint64_t>(Imm);
|
|
|
|
int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
|
|
|
|
|
|
|
|
// If the shifted value fits 32 bits.
|
|
|
|
if (isInt<32>(ImmSh)) {
|
|
|
|
// Go with the shifted value.
|
|
|
|
Imm = ImmSh;
|
|
|
|
} else {
|
|
|
|
// Still stuck with a 64 bit value.
|
|
|
|
Remainder = Imm;
|
|
|
|
Shift = 32;
|
|
|
|
Imm >>= 32;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Intermediate operand.
|
|
|
|
SDNode *Result;
|
|
|
|
|
|
|
|
// Handle first 32 bits.
|
|
|
|
unsigned Lo = Imm & 0xFFFF;
|
|
|
|
unsigned Hi = (Imm >> 16) & 0xFFFF;
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
auto getI32Imm = [CurDAG, dl](unsigned Imm) {
|
|
|
|
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
|
2015-01-01 10:53:29 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
// Simple value.
|
|
|
|
if (isInt<16>(Imm)) {
|
|
|
|
// Just the Lo bits.
|
|
|
|
Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
|
|
|
|
} else if (Lo) {
|
|
|
|
// Handle the Hi bits.
|
|
|
|
unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
|
|
|
|
Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
|
|
|
|
// And Lo bits.
|
|
|
|
Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Lo));
|
|
|
|
} else {
|
|
|
|
// Just the Hi bits.
|
|
|
|
Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
|
|
|
|
}
|
|
|
|
|
|
|
|
// If no shift, we're done.
|
|
|
|
if (!Shift) return Result;
|
|
|
|
|
|
|
|
// Shift for the next step if the upper 32 bits were not zero.
|
|
|
|
if (Imm) {
|
|
|
|
Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0),
|
|
|
|
getI32Imm(Shift),
|
|
|
|
getI32Imm(63 - Shift));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add in the last bits as required.
|
|
|
|
if ((Hi = (Remainder >> 16) & 0xFFFF)) {
|
|
|
|
Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Hi));
|
|
|
|
}
|
|
|
|
if ((Lo = Remainder & 0xFFFF)) {
|
|
|
|
Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
|
|
|
|
SDValue(Result, 0), getI32Imm(Lo));
|
|
|
|
}
|
|
|
|
|
|
|
|
return Result;
|
|
|
|
}
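// Illustrative result: for Imm = 0x123456789ABCDEF0 (the example above) this
// emits LIS8 0x1234, ORI8 0x5678, an RLDICR by 32, ORIS8 0x9ABC and
// ORI8 0xDEF0, matching the count of 5 predicted by getInt64CountDirect.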
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
|
2016-05-21 05:43:23 +08:00
|
|
|
unsigned Count = getInt64CountDirect(Imm);
|
2015-01-05 11:41:38 +08:00
|
|
|
if (Count == 1)
|
2016-05-21 05:43:23 +08:00
|
|
|
return getInt64Direct(CurDAG, dl, Imm);
|
2015-01-05 11:41:38 +08:00
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
unsigned RMin = 0;
|
|
|
|
|
2015-01-05 11:41:38 +08:00
|
|
|
int64_t MatImm;
|
|
|
|
unsigned MaskEnd;
|
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
for (unsigned r = 1; r < 63; ++r) {
|
2015-01-05 11:41:38 +08:00
|
|
|
uint64_t RImm = Rot64(Imm, r);
|
2016-05-21 05:43:23 +08:00
|
|
|
unsigned RCount = getInt64CountDirect(RImm) + 1;
|
2015-01-05 11:41:38 +08:00
|
|
|
if (RCount < Count) {
|
|
|
|
Count = RCount;
|
|
|
|
RMin = r;
|
|
|
|
MatImm = RImm;
|
|
|
|
MaskEnd = 63;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the immediate to generate has many trailing zeros, it might be
|
|
|
|
// worthwhile to generate a rotated value with too many leading ones
|
|
|
|
// (because that's free with li/lis's sign-extension semantics), and then
|
|
|
|
// mask them off after rotation.
|
|
|
|
|
|
|
|
unsigned LS = findLastSet(RImm);
|
|
|
|
// We're adding (63-LS) higher-order ones, and we expect to mask them off
|
|
|
|
// after performing the inverse rotation by (64-r). So we need that:
|
|
|
|
// 63-LS == 64-r => LS == r-1
|
|
|
|
if (LS != r-1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
|
|
|
|
uint64_t RImmWithOnes = RImm | OnesMask;
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
RCount = getInt64CountDirect(RImmWithOnes) + 1;
|
2015-01-04 23:43:55 +08:00
|
|
|
if (RCount < Count) {
|
|
|
|
Count = RCount;
|
|
|
|
RMin = r;
|
2015-01-05 11:41:38 +08:00
|
|
|
MatImm = RImmWithOnes;
|
|
|
|
MaskEnd = LS;
|
2015-01-04 23:43:55 +08:00
|
|
|
}
|
2015-01-04 20:35:03 +08:00
|
|
|
}
|
|
|
|
|
2015-01-04 23:43:55 +08:00
|
|
|
if (!RMin)
|
2016-05-21 05:43:23 +08:00
|
|
|
return getInt64Direct(CurDAG, dl, Imm);
|
2015-01-04 23:43:55 +08:00
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
auto getI32Imm = [CurDAG, dl](unsigned Imm) {
|
|
|
|
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
|
2015-01-04 23:43:55 +08:00
|
|
|
};
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0);
|
2015-01-05 11:41:38 +08:00
|
|
|
return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
|
|
|
|
getI32Imm(64 - RMin), getI32Imm(MaskEnd));
|
2015-01-04 20:35:03 +08:00
|
|
|
}
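// Illustrative result: for Imm = 0xF000000000000001 (see getInt64Count above)
// MatImm is 0x1F with RMin = 4 and MaskEnd = 63, so the selected sequence is
// an li of 31 followed by RLDICR with SH = 64 - 4 = 60 and ME = 63, i.e. a
// plain rotate, instead of the three-instruction direct form.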
|
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
// Select a 64-bit constant.
|
2016-05-21 05:43:23 +08:00
|
|
|
static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
|
2015-01-01 10:53:29 +08:00
|
|
|
SDLoc dl(N);
|
|
|
|
|
|
|
|
// Get 64 bit value.
|
|
|
|
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
|
2016-05-21 05:43:23 +08:00
|
|
|
return getInt64(CurDAG, dl, Imm);
|
2015-01-01 10:53:29 +08:00
|
|
|
}
|
|
|
|
|
[PowerPC] Improve instruction selection bit-permuting operations (32-bit)
The PowerPC backend, somewhat embarrassingly, did not generate an
optimal-length sequence of instructions for a 32-bit bswap. While adding a
pattern for the bswap intrinsic to fix this would not have been terribly
difficult, doing so would not have addressed the real problem: we had been
generating poor code for many bit-permuting operations (by which I mean things
like byte-swap that permute the bits of one or more inputs around in various
ways). Here are some initial steps toward solving this deficiency.
Bit-permuting operations are represented, at the SDAG level, using ISD::ROTL,
SHL, SRL, AND and OR (mostly with constant second operands). Looking back
through these operations, we can build up a description of the bits in the
resulting value in terms of bits of one or more input values (and constant
zeros). For each bit, we compute the rotation amount from the original value,
and then group consecutive (value, rotation factor) bits into groups. Groups
sharing these attributes are then collected and sorted, and we can then
instruction select the entire permutation using a combination of masked
rotations (rlwinm), imm ands (andi/andis), and masked rotation inserts
(rlwimi).
The result is that instead of lowering an i32 bswap as:
rlwinm 5, 3, 24, 16, 23
rlwinm 4, 3, 24, 0, 7
rlwimi 4, 3, 8, 8, 15
rlwimi 5, 3, 8, 24, 31
rlwimi 4, 5, 0, 16, 31
we now produce:
rlwinm 4, 3, 8, 0, 31
rlwimi 4, 3, 24, 16, 23
rlwimi 4, 3, 24, 0, 7
and for the 'test6' example in the PowerPC/README.txt file:
unsigned test6(unsigned x) {
return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}
we used to produce:
lis 4, 255
rlwinm 3, 3, 16, 0, 31
ori 4, 4, 255
and 3, 3, 4
and now we produce:
rlwinm 4, 3, 16, 24, 31
rlwimi 4, 3, 16, 8, 15
and, as a nice bonus, this fixes the FIXME in
test/CodeGen/PowerPC/rlwimi-and.ll.
This commit does not include instruction-selection for i64 operations, those
will come later.
llvm-svn: 224318
2014-12-16 13:51:41 +08:00
|
|
|
namespace {
|
|
|
|
class BitPermutationSelector {
|
|
|
|
struct ValueBit {
|
|
|
|
SDValue V;
|
|
|
|
|
|
|
|
// The bit number in the value, using a convention where bit 0 is the
|
|
|
|
// lowest-order bit.
|
|
|
|
unsigned Idx;
|
|
|
|
|
|
|
|
enum Kind {
|
|
|
|
ConstZero,
|
|
|
|
Variable
|
|
|
|
} K;
|
|
|
|
|
|
|
|
ValueBit(SDValue V, unsigned I, Kind K = Variable)
|
|
|
|
: V(V), Idx(I), K(K) {}
|
|
|
|
ValueBit(Kind K = Variable)
|
|
|
|
: V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
|
|
|
|
|
|
|
|
bool isZero() const {
|
|
|
|
return K == ConstZero;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool hasValue() const {
|
|
|
|
return K == Variable;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue getValue() const {
|
|
|
|
assert(hasValue() && "Cannot get the value of a constant bit");
|
|
|
|
return V;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned getValueBitIndex() const {
|
|
|
|
assert(hasValue() && "Cannot get the value bit index of a constant bit");
|
|
|
|
return Idx;
|
|
|
|
}
|
|
|
|
};
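// For example (illustrative): when the bits of (X << 8) are described at i32,
// bit 12 of the result is ValueBit(X, 4) and bits 0-7 are ConstZero.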
|
|
|
|
|
|
|
|
// A bit group has the same underlying value and the same rotate factor.
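// For example (illustrative), an i32 byte swap of X breaks down into four
// groups: (X, RLAmt = 8) over bits [0, 7] and [16, 23], and (X, RLAmt = 24)
// over bits [8, 15] and [24, 31].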
|
|
|
|
struct BitGroup {
|
|
|
|
SDValue V;
|
|
|
|
unsigned RLAmt;
|
|
|
|
unsigned StartIdx, EndIdx;
|
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
// This rotation amount assumes that the lower 32 bits of the quantity are
|
|
|
|
// replicated in the high 32 bits by the rotation operator (which is done
|
|
|
|
// by rlwinm and friends in 64-bit mode).
|
|
|
|
bool Repl32;
|
|
|
|
// Did converting to Repl32 == true change the rotation factor? If it did,
|
|
|
|
// it decreased it by 32.
|
|
|
|
bool Repl32CR;
|
|
|
|
// Was this group coalesced after setting Repl32 to true?
|
|
|
|
bool Repl32Coalesced;
|
|
|
|
|
2014-12-16 13:51:41 +08:00
|
|
|
BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
|
2015-01-01 10:53:29 +08:00
|
|
|
: V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
|
|
|
|
Repl32Coalesced(false) {
|
2014-12-16 13:51:41 +08:00
|
|
|
DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
|
|
|
|
" [" << S << ", " << E << "]\n");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Information on each (Value, RLAmt) pair (like the number of groups
|
|
|
|
// associated with each) used to choose the lowering method.
|
|
|
|
struct ValueRotInfo {
|
|
|
|
SDValue V;
|
|
|
|
unsigned RLAmt;
|
|
|
|
unsigned NumGroups;
|
|
|
|
unsigned FirstGroupStartIdx;
|
2015-01-01 10:53:29 +08:00
|
|
|
bool Repl32;
|
2014-12-16 13:51:41 +08:00
|
|
|
|
|
|
|
ValueRotInfo()
|
2015-01-01 10:53:29 +08:00
|
|
|
: RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
|
|
|
|
Repl32(false) {}
|
2014-12-16 13:51:41 +08:00
|
|
|
|
|
|
|
// For sorting (in reverse order) by NumGroups, and then by
|
|
|
|
// FirstGroupStartIdx.
|
|
|
|
bool operator < (const ValueRotInfo &Other) const {
|
2015-01-01 10:53:29 +08:00
|
|
|
// We need to sort so that the non-Repl32 come first because, when we're
|
|
|
|
// doing masking, the Repl32 bit groups might be subsumed into the 64-bit
|
|
|
|
// masking operation.
|
|
|
|
if (Repl32 < Other.Repl32)
|
|
|
|
return true;
|
|
|
|
else if (Repl32 > Other.Repl32)
|
|
|
|
return false;
|
|
|
|
else if (NumGroups > Other.NumGroups)
|
2014-12-16 13:51:41 +08:00
|
|
|
return true;
|
|
|
|
else if (NumGroups < Other.NumGroups)
|
|
|
|
return false;
|
|
|
|
else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Return true if something interesting was deduced; return false if we're
|
|
|
|
// providing only a generic representation of V (or something else likewise
|
|
|
|
// uninteresting for instruction selection).
|
|
|
|
bool getValueBits(SDValue V, SmallVector<ValueBit, 64> &Bits) {
|
|
|
|
switch (V.getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case ISD::ROTL:
|
|
|
|
if (isa<ConstantSDNode>(V.getOperand(1))) {
|
|
|
|
unsigned RotAmt = V.getConstantOperandVal(1);
|
|
|
|
|
|
|
|
SmallVector<ValueBit, 64> LHSBits(Bits.size());
|
|
|
|
getValueBits(V.getOperand(0), LHSBits);
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < Bits.size(); ++i)
|
|
|
|
Bits[i] = LHSBits[i < RotAmt ? i + (Bits.size() - RotAmt) : i - RotAmt];
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::SHL:
|
|
|
|
if (isa<ConstantSDNode>(V.getOperand(1))) {
|
|
|
|
unsigned ShiftAmt = V.getConstantOperandVal(1);
|
|
|
|
|
|
|
|
SmallVector<ValueBit, 64> LHSBits(Bits.size());
|
|
|
|
getValueBits(V.getOperand(0), LHSBits);
|
|
|
|
|
|
|
|
for (unsigned i = ShiftAmt; i < Bits.size(); ++i)
|
|
|
|
Bits[i] = LHSBits[i - ShiftAmt];
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < ShiftAmt; ++i)
|
|
|
|
Bits[i] = ValueBit(ValueBit::ConstZero);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::SRL:
|
|
|
|
if (isa<ConstantSDNode>(V.getOperand(1))) {
|
|
|
|
unsigned ShiftAmt = V.getConstantOperandVal(1);
|
|
|
|
|
|
|
|
SmallVector<ValueBit, 64> LHSBits(Bits.size());
|
|
|
|
getValueBits(V.getOperand(0), LHSBits);
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < Bits.size() - ShiftAmt; ++i)
|
|
|
|
Bits[i] = LHSBits[i + ShiftAmt];
|
|
|
|
|
|
|
|
for (unsigned i = Bits.size() - ShiftAmt; i < Bits.size(); ++i)
|
|
|
|
Bits[i] = ValueBit(ValueBit::ConstZero);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::AND:
|
|
|
|
if (isa<ConstantSDNode>(V.getOperand(1))) {
|
|
|
|
uint64_t Mask = V.getConstantOperandVal(1);
|
|
|
|
|
|
|
|
SmallVector<ValueBit, 64> LHSBits(Bits.size());
|
|
|
|
bool LHSTrivial = getValueBits(V.getOperand(0), LHSBits);
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < Bits.size(); ++i)
|
|
|
|
if (((Mask >> i) & 1) == 1)
|
|
|
|
Bits[i] = LHSBits[i];
|
|
|
|
else
|
|
|
|
Bits[i] = ValueBit(ValueBit::ConstZero);
|
|
|
|
|
|
|
|
// Mark this as interesting, only if the LHS was also interesting. This
|
|
|
|
// prevents the overall procedure from matching a single immediate 'and'
|
|
|
|
// (which is non-optimal because such an and might be folded with other
|
|
|
|
// things if we don't select it here).
|
|
|
|
return LHSTrivial;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::OR: {
|
|
|
|
SmallVector<ValueBit, 64> LHSBits(Bits.size()), RHSBits(Bits.size());
|
|
|
|
getValueBits(V.getOperand(0), LHSBits);
|
|
|
|
getValueBits(V.getOperand(1), RHSBits);
|
|
|
|
|
|
|
|
bool AllDisjoint = true;
|
|
|
|
for (unsigned i = 0; i < Bits.size(); ++i)
|
|
|
|
if (LHSBits[i].isZero())
|
|
|
|
Bits[i] = RHSBits[i];
|
|
|
|
else if (RHSBits[i].isZero())
|
|
|
|
Bits[i] = LHSBits[i];
|
|
|
|
else {
|
|
|
|
AllDisjoint = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!AllDisjoint)
|
|
|
|
break;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < Bits.size(); ++i)
|
|
|
|
Bits[i] = ValueBit(V, i);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
  // For each value (except the constant ones), compute the left-rotate amount
  // to get it from its original to final position.
  void computeRotationAmounts() {
    HasZeros = false;
    RLAmt.resize(Bits.size());
    for (unsigned i = 0; i < Bits.size(); ++i)
      if (Bits[i].hasValue()) {
        unsigned VBI = Bits[i].getValueBitIndex();
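        // (Example: for a 32-bit value, a result bit at index 0 that comes from
        // original bit 24 has VBI == 24 > i == 0, so it takes the second branch
        // below: RLAmt = 32 - (24 - 0) == 8, and rotating left by 8 indeed
        // moves bit 24 to (24 + 8) % 32 == 0.)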
        if (i >= VBI)
          RLAmt[i] = i - VBI;
        else
          RLAmt[i] = Bits.size() - (VBI - i);
      } else if (Bits[i].isZero()) {
        HasZeros = true;
        RLAmt[i] = UINT32_MAX;
      } else {
        llvm_unreachable("Unknown value bit type");
      }
  }
[PowerPC] Improve instruction selection bit-permuting operations (64-bit)
This is the second installment of improvements to instruction selection for "bit
permutation" instruction sequences. r224318 added logic for instruction
selection for 32-bit bit permutation sequences, and this adds lowering for
64-bit sequences. The 64-bit sequences are more complicated than the 32-bit
ones because:
a) the 64-bit versions of the 32-bit rotate-and-mask instructions
work by replicating the lower 32-bits of the value-to-be-rotated into the
upper 32 bits -- and integrating this into the cost modeling for the various
bit group operations is non-trivial
b) unlike the 32-bit instructions in 32-bit mode, the rotate-and-mask instructions
cannot, in one instruction, specify the
mask starting index, the mask ending index, and the rotation factor. Also,
forming arbitrary 64-bit constants is more complicated than in 32-bit mode
because the number of instructions necessary is value dependent.
Plus, support for 'late masking' was added: it is sometimes more efficient to
treat the overall value as if it had no mandatory zero bits when planning the
bit-group insertions, and then mask them in at the very end. Unfortunately, as
the structure of the bit groups is different in the two cases, the more
feasible implementation technique was to generate both instruction sequences,
and then pick the shorter one.
And finally, we now generate reasonable code for i64 bswap:
rldicl 5, 3, 16, 0
rldicl 4, 3, 8, 0
rldicl 6, 3, 24, 0
rldimi 4, 5, 8, 48
rldicl 5, 3, 32, 0
rldimi 4, 6, 16, 40
rldicl 6, 3, 48, 0
rldimi 4, 5, 24, 32
rldicl 5, 3, 56, 0
rldimi 4, 6, 40, 16
rldimi 4, 5, 48, 8
rldimi 4, 3, 56, 0
vs. what we used to produce:
li 4, 255
rldicl 5, 3, 24, 40
rldicl 6, 3, 40, 24
rldicl 7, 3, 56, 8
sldi 8, 3, 8
sldi 10, 3, 24
sldi 12, 3, 40
rldicl 0, 3, 8, 56
sldi 9, 4, 32
sldi 11, 4, 40
sldi 4, 4, 48
andi. 5, 5, 65280
andis. 6, 6, 255
andis. 7, 7, 65280
sldi 3, 3, 56
and 8, 8, 9
and 4, 12, 4
and 9, 10, 11
or 6, 7, 6
or 5, 5, 0
or 3, 3, 4
or 7, 9, 8
or 4, 6, 5
or 3, 3, 7
or 3, 3, 4
which is 12 instructions, instead of 25, and seems optimal (at least in terms
of code size).
llvm-svn: 225056
2015-01-01 10:53:29 +08:00
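The 32-bit replication trick described above can be checked in isolation. The following standalone sketch (illustrative only; RotL64 and the constants are stand-ins, not backend helpers) verifies that two bit groups with rotate amounts 8 and 40, both sourced from the low 32 bits of a value, produce the same 16 low result bits as a single rotate-by-8 of the value with its low word replicated into its high word -- the situation that the assignRepl32BitGroups comment further down diagrams.

// Standalone illustration: a (RLAmt = 8) group over result bits 15..8 plus a
// (RLAmt = 40) group over result bits 7..0 equals one replicated-32-bit group
// with RLAmt = 8, for any input value.
#include <cassert>
#include <cstdint>

static uint64_t RotL64(uint64_t V, unsigned Amt) {
  Amt &= 63;
  return Amt ? (V << Amt) | (V >> (64 - Amt)) : V;
}

int main() {
  const uint64_t Tests[] = {0x0123456789ABCDEFULL, 0xFFFFFFFF00000000ULL,
                            0x00000000CAFEBABEULL};
  for (uint64_t X : Tests) {
    // Two separate groups: bits 15..8 via RLAmt = 8, bits 7..0 via RLAmt = 40.
    uint64_t TwoGroups =
        (RotL64(X, 8) & 0xFF00ULL) | (RotL64(X, 40) & 0x00FFULL);

    // One group: replicate the low 32 bits into the high 32 bits, then a
    // single rotate by 8 covers the same 16 result bits.
    uint64_t Repl32 = (X & 0xFFFFFFFFULL) | ((X & 0xFFFFFFFFULL) << 32);
    uint64_t OneGroup = RotL64(Repl32, 8) & 0xFFFFULL;

    assert(TwoGroups == OneGroup);
    (void)TwoGroups; (void)OneGroup; // keep -Wunused quiet under NDEBUG
  }
  return 0;
}

Merging groups this way is what makes the Repl32 bookkeeping below worthwhile: a single rotate-and-mask can replace what would otherwise take two rotate-and-insert steps.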
  // Collect groups of consecutive bits with the same underlying value and
  // rotation factor. If we're doing late masking, we ignore zeros, otherwise
  // they break up groups.
  void collectBitGroups(bool LateMask) {
    BitGroups.clear();

    unsigned LastRLAmt = RLAmt[0];
    SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
    unsigned LastGroupStartIdx = 0;
    for (unsigned i = 1; i < Bits.size(); ++i) {
      unsigned ThisRLAmt = RLAmt[i];
      SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
      if (LateMask && !ThisValue) {
        ThisValue = LastValue;
        ThisRLAmt = LastRLAmt;
        // If we're doing late masking, then the first bit group always starts
        // at zero (even if the first bits were zero).
        if (BitGroups.empty())
          LastGroupStartIdx = 0;
      }

      // If this bit has the same underlying value and the same rotate factor as
      // the last one, then they're part of the same group.
      if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
        continue;

      if (LastValue.getNode())
        BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
                                     i-1));
      LastRLAmt = ThisRLAmt;
      LastValue = ThisValue;
      LastGroupStartIdx = i;
    }
    if (LastValue.getNode())
      BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
                                   Bits.size()-1));

    if (BitGroups.empty())
      return;

    // We might be able to combine the first and last groups.
    if (BitGroups.size() > 1) {
      // If the first and last groups are the same, then remove the first group
      // in favor of the last group, making the ending index of the last group
      // equal to the ending index of the to-be-removed first group.
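      // (Illustrative note: if, say, result bits 31..30 and 1..0 all come from
      // the same value with the same rotation factor, the two end groups merge
      // into a single wrap-around group whose StartIdx is greater than its
      // EndIdx; later code in this file handles that ordering.)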
      if (BitGroups[0].StartIdx == 0 &&
          BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
          BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
          BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
        DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
        BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
        BitGroups.erase(BitGroups.begin());
      }
    }
  }
  // Take all (SDValue, RLAmt) pairs and sort them by the number of groups
  // associated with each. If there is a degeneracy, pick the one that occurs
  // first (in the final value).
  void collectValueRotInfo() {
    ValueRots.clear();

    for (auto &BG : BitGroups) {
      unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0);
      ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)];
      VRI.V = BG.V;
      VRI.RLAmt = BG.RLAmt;
      VRI.Repl32 = BG.Repl32;
      VRI.NumGroups += 1;
      VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
    }

    // Now that we've collected the various ValueRotInfo instances, we need to
    // sort them.
    ValueRotsVec.clear();
    for (auto &I : ValueRots) {
      ValueRotsVec.push_back(I.second);
    }
    std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
  }
  // In 64-bit mode, rlwinm and friends have a rotation operator that
  // replicates the low-order 32 bits into the high-order 32-bits. The mask
  // indices of these instructions can only be in the lower 32 bits, so they
  // can only represent some 64-bit bit groups. However, when they can be used,
  // the 32-bit replication can be used to represent, as a single bit group,
  // otherwise separate bit groups. We'll convert to replicated-32-bit bit
  // groups when possible. Returns true if any of the bit groups were
  // converted.
  void assignRepl32BitGroups() {
    // If we have bits like this:
    //
    // Indices:     15 14 13 12 11 10 9 8  7  6  5  4  3  2  1  0
    // V bits:  ...  7  6  5  4  3  2 1 0 31 30 29 28 27 26 25 24
    // Groups:     |      RLAmt = 8      |      RLAmt = 40       |
    //
    // But, making use of a 32-bit operation that replicates the low-order 32
    // bits into the high-order 32 bits, this can be one bit group with a RLAmt
    // of 8.

    auto IsAllLow32 = [this](BitGroup & BG) {
      if (BG.StartIdx <= BG.EndIdx) {
        for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) {
          if (!Bits[i].hasValue())
            continue;
          if (Bits[i].getValueBitIndex() >= 32)
            return false;
        }
      } else {
        for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) {
          if (!Bits[i].hasValue())
            continue;
          if (Bits[i].getValueBitIndex() >= 32)
            return false;
        }
        for (unsigned i = 0; i <= BG.EndIdx; ++i) {
          if (!Bits[i].hasValue())
            continue;
          if (Bits[i].getValueBitIndex() >= 32)
            return false;
        }
      }

      return true;
    };

    for (auto &BG : BitGroups) {
      if (BG.StartIdx < 32 && BG.EndIdx < 32) {
        if (IsAllLow32(BG)) {
          if (BG.RLAmt >= 32) {
            BG.RLAmt -= 32;
            BG.Repl32CR = true;
          }

          BG.Repl32 = true;

          DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
                          BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
                          " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
        }
      }
    }
    // Now walk through the bit groups, consolidating where possible.
    for (auto I = BitGroups.begin(); I != BitGroups.end();) {
      // We might want to remove this bit group by merging it with the previous
      // group (which might be the ending group).
      auto IP = (I == BitGroups.begin()) ?
                std::prev(BitGroups.end()) : std::prev(I);
      if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
          I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {

        DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
                        I->V.getNode() << " RLAmt = " << I->RLAmt <<
                        " [" << I->StartIdx << ", " << I->EndIdx <<
                        "] with group with range [" <<
                        IP->StartIdx << ", " << IP->EndIdx << "]\n");

        IP->EndIdx = I->EndIdx;
        IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
        IP->Repl32Coalesced = true;
        I = BitGroups.erase(I);
        continue;
      } else {
        // There is a special case worth handling: If there is a single group
        // covering the entire upper 32 bits, and it can be merged with both
        // the next and previous groups (which might be the same group), then
        // do so. If it is the same group (so there will be only one group in
        // total), then we need to reverse the order of the range so that it
        // covers the entire 64 bits.
        if (I->StartIdx == 32 && I->EndIdx == 63) {
          assert(std::next(I) == BitGroups.end() &&
                 "bit group ends at index 63 but there is another?");
          auto IN = BitGroups.begin();

          if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
              (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
              IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
              IsAllLow32(*I)) {

            DEBUG(dbgs() << "\tcombining bit group for " <<
                            I->V.getNode() << " RLAmt = " << I->RLAmt <<
                            " [" << I->StartIdx << ", " << I->EndIdx <<
                            "] with 32-bit replicated groups with ranges [" <<
                            IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
                            IN->StartIdx << ", " << IN->EndIdx << "]\n");

            if (IP == IN) {
              // There is only one other group; change it to cover the whole
              // range (backward, so that it can still be Repl32 but cover the
              // whole 64-bit range).
              IP->StartIdx = 31;
              IP->EndIdx = 30;
              IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32;
              IP->Repl32Coalesced = true;
              I = BitGroups.erase(I);
            } else {
              // There are two separate groups, one before this group and one
              // after us (at the beginning). We're going to remove this group,
              // but also the group at the very beginning.
              IP->EndIdx = IN->EndIdx;
              IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32;
              IP->Repl32Coalesced = true;
              I = BitGroups.erase(I);
              BitGroups.erase(BitGroups.begin());
            }

            // This must be the last group in the vector (and we might have
            // just invalidated the iterator above), so break here.
            break;
          }
        }
      }

      ++I;
    }
  }
  SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
    return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
  }
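  // The returned mask has a 1 in every result position that is backed by a
  // value bit and a 0 in every position that must be constant zero, so a
  // late-masking sequence can AND it in after the bit groups have been
  // assembled (the "mask them in at the very end" step described above).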
  uint64_t getZerosMask() {
    uint64_t Mask = 0;
    for (unsigned i = 0; i < Bits.size(); ++i) {
      if (Bits[i].hasValue())
        continue;
      Mask |= (UINT64_C(1) << i);
    }

    return ~Mask;
  }
  // Depending on the number of groups for a particular value, it might be
  // better to rotate, mask explicitly (using andi/andis), and then or the
  // result. Select this part of the result first.
  void SelectAndParts32(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
    if (BPermRewriterNoMasking)
      return;

    for (ValueRotInfo &VRI : ValueRotsVec) {
      unsigned Mask = 0;
      for (unsigned i = 0; i < Bits.size(); ++i) {
        if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
          continue;
        if (RLAmt[i] != VRI.RLAmt)
          continue;
        Mask |= (1u << i);
      }

      // Compute the masks for andi/andis that would be necessary.
      unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
      assert((ANDIMask != 0 || ANDISMask != 0) &&
             "No set bits in mask for value bit groups");
      bool NeedsRotate = VRI.RLAmt != 0;

      // We're trying to minimize the number of instructions. If we have one
      // group, using one of andi/andis can break even. If we have three
      // groups, we can use both andi and andis and break even (to use both
      // andi and andis we also need to or the results together). We need four
      // groups if we also need to rotate. To use andi/andis we need to do more
      // than break even because rotate-and-mask instructions tend to be easier
      // to schedule.

      // FIXME: We've biased here against using andi/andis, which is right for
      // POWER cores, but not optimal everywhere. For example, on the A2,
      // andi/andis have single-cycle latency whereas the rotate-and-mask
      // instructions take two cycles, and it would be better to bias toward
      // andi/andis in break-even cases.

      unsigned NumAndInsts = (unsigned) NeedsRotate +
                             (unsigned) (ANDIMask != 0) +
                             (unsigned) (ANDISMask != 0) +
                             (unsigned) (ANDIMask != 0 && ANDISMask != 0) +
                             (unsigned) (bool) Res;
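      // (Illustrative arithmetic: when a rotate is needed, both halves of the
      // mask are non-zero, and Res is still empty, NumAndInsts is
      // 1 + 1 + 1 + 1 + 0 == 4, so the masking sequence is taken only when this
      // value would otherwise need at least five rotate-and-insert groups --
      // the "more than break even" policy described above.)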
      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
                      " RL: " << VRI.RLAmt << ":" <<
                      "\n\t\t\tisel using masking: " << NumAndInsts <<
                      " using rotates: " << VRI.NumGroups << "\n");
      if (NumAndInsts >= VRI.NumGroups)
        continue;
      DEBUG(dbgs() << "\t\t\t\tusing masking\n");

      if (InstCnt) *InstCnt += NumAndInsts;
      SDValue VRot;
      if (VRI.RLAmt) {
        SDValue Ops[] =
          { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl),
            getI32Imm(31, dl) };
|
|
|
VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
|
|
|
|
Ops), 0);
|
|
|
|
} else {
|
|
|
|
VRot = VRI.V;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue ANDIVal, ANDISVal;
|
|
|
|
if (ANDIMask != 0)
|
|
|
|
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
|
2015-04-28 22:05:47 +08:00
|
|
|
VRot, getI32Imm(ANDIMask, dl)), 0);
|
|
|
|
if (ANDISMask != 0)
|
|
|
|
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
|
2015-04-28 22:05:47 +08:00
|
|
|
VRot, getI32Imm(ANDISMask, dl)), 0);
|
|
|
|
|
|
|
|
SDValue TotalVal;
|
|
|
|
if (!ANDIVal)
|
|
|
|
TotalVal = ANDISVal;
|
|
|
|
else if (!ANDISVal)
|
|
|
|
TotalVal = ANDIVal;
|
|
|
|
else
|
|
|
|
TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
|
|
|
|
ANDIVal, ANDISVal), 0);
|
|
|
|
|
|
|
|
if (!Res)
|
|
|
|
Res = TotalVal;
|
|
|
|
else
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
|
|
|
|
Res, TotalVal), 0);
|
|
|
|
|
|
|
|
// Now, remove all groups with this underlying value and rotation
|
|
|
|
// factor.
|
2015-06-20 23:59:41 +08:00
|
|
|
eraseMatchingBitGroups([VRI](const BitGroup &BG) {
|
|
|
|
return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt;
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Instruction selection for the 32-bit case.
|
|
|
|
SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) {
|
|
|
|
SDLoc dl(N);
|
|
|
|
SDValue Res;
|
|
|
|
|
|
|
|
if (InstCnt) *InstCnt = 0;
|
|
|
|
|
|
|
|
// Take care of cases that should use andi/andis first.
|
|
|
|
SelectAndParts32(dl, Res, InstCnt);
|
|
|
|
|
|
|
|
// If we've not yet selected a 'starting' instruction, and we have no zeros
|
|
|
|
// to fill in, select the (Value, RLAmt) with the highest priority (largest
|
|
|
|
// number of groups), and start with this rotated value.
|
|
|
|
if ((!HasZeros || LateMask) && !Res) {
|
|
|
|
ValueRotInfo &VRI = ValueRotsVec[0];
|
|
|
|
if (VRI.RLAmt) {
|
|
|
|
if (InstCnt) *InstCnt += 1;
|
|
|
|
SDValue Ops[] =
|
2015-04-28 22:05:47 +08:00
|
|
|
{ VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl),
|
|
|
|
getI32Imm(31, dl) };
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops),
|
|
|
|
0);
|
|
|
|
} else {
|
|
|
|
Res = VRI.V;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now, remove all groups with this underlying value and rotation factor.
|
2015-06-20 23:59:41 +08:00
|
|
|
eraseMatchingBitGroups([VRI](const BitGroup &BG) {
|
|
|
|
return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt;
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
if (InstCnt) *InstCnt += BitGroups.size();
|
|
|
|
|
|
|
|
// Insert the other groups (one at a time).
|
|
|
|
for (auto &BG : BitGroups) {
|
|
|
|
if (!Res) {
|
|
|
|
SDValue Ops[] =
|
2015-04-28 22:05:47 +08:00
|
|
|
{ BG.V, getI32Imm(BG.RLAmt, dl),
|
|
|
|
getI32Imm(Bits.size() - BG.EndIdx - 1, dl),
|
|
|
|
getI32Imm(Bits.size() - BG.StartIdx - 1, dl) };
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
|
|
|
|
} else {
|
|
|
|
SDValue Ops[] =
|
2015-04-28 22:05:47 +08:00
|
|
|
{ Res, BG.V, getI32Imm(BG.RLAmt, dl),
|
|
|
|
getI32Imm(Bits.size() - BG.EndIdx - 1, dl),
|
|
|
|
getI32Imm(Bits.size() - BG.StartIdx - 1, dl) };
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (LateMask) {
|
|
|
|
unsigned Mask = (unsigned) getZerosMask();
|
|
|
|
|
|
|
|
unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
|
|
|
|
assert((ANDIMask != 0 || ANDISMask != 0) &&
|
|
|
|
"No set bits in zeros mask?");
|
|
|
|
|
|
|
|
if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
|
|
|
|
(unsigned) (ANDISMask != 0) +
|
|
|
|
(unsigned) (ANDIMask != 0 && ANDISMask != 0);
|
|
|
|
|
|
|
|
SDValue ANDIVal, ANDISVal;
|
|
|
|
if (ANDIMask != 0)
|
|
|
|
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
|
2015-04-28 22:05:47 +08:00
|
|
|
Res, getI32Imm(ANDIMask, dl)), 0);
|
|
|
|
if (ANDISMask != 0)
|
|
|
|
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
|
2015-04-28 22:05:47 +08:00
|
|
|
Res, getI32Imm(ANDISMask, dl)), 0);
|
|
|
|
|
|
|
|
if (!ANDIVal)
|
|
|
|
Res = ANDISVal;
|
|
|
|
else if (!ANDISVal)
|
|
|
|
Res = ANDIVal;
|
|
|
|
else
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
|
|
|
|
ANDIVal, ANDISVal), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Res.getNode();
|
|
|
|
}

  unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32,
                                unsigned MaskStart, unsigned MaskEnd,
                                bool IsIns) {
    // In the notation used by the instructions, 'start' and 'end' are reversed
    // because bits are counted from high to low order.
    unsigned InstMaskStart = 64 - MaskEnd - 1,
             InstMaskEnd   = 64 - MaskStart - 1;
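    // Worked example (illustrative, not from the original source): a mask
    // covering the low 48 bits, i.e. MaskStart = 0 and MaskEnd = 47 in LSB-0
    // numbering, becomes InstMaskStart = 16 and InstMaskEnd = 63 in the MSB-0
    // numbering used by the rotate-and-mask instructions.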

    if (Repl32)
      return 1;

    if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) ||
        InstMaskEnd == 63 - RLAmt)
      return 1;

    return 2;
  }

  // For 64-bit values, not all combinations of rotates and masks are
  // available. Produce one if it is available.
  SDValue SelectRotMask64(SDValue V, const SDLoc &dl, unsigned RLAmt,
                          bool Repl32, unsigned MaskStart, unsigned MaskEnd,
                          unsigned *InstCnt = nullptr) {
    // In the notation used by the instructions, 'start' and 'end' are reversed
    // because bits are counted from high to low order.
    unsigned InstMaskStart = 64 - MaskEnd - 1,
             InstMaskEnd   = 64 - MaskStart - 1;

    if (InstCnt) *InstCnt += 1;

    if (Repl32) {
      // This rotation amount assumes that the lower 32 bits of the quantity
      // are replicated in the high 32 bits by the rotation operator (which is
      // done by rlwinm and friends).
      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
      assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
      SDValue Ops[] =
        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
          getI32Imm(InstMaskEnd - 32, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64,
                                            Ops), 0);
    }
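    // Illustrative note (not from the original source): because rlwinm
    // rotates the replicated low word, a Repl32 group covering the whole low
    // 32 bits (MaskStart = 0, MaskEnd = 31) maps to rlwinm operands
    // MB = 0, ME = 31, i.e. the full 32-bit mask.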

    if (InstMaskEnd == 63) {
      SDValue Ops[] =
        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0);
    }
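    // For example (illustrative, not from the original source): rotating left
    // by 16 while keeping only the low 48 bits gives InstMaskStart = 16 and
    // InstMaskEnd = 63, so the case above emits a single rldicl
    // (rotate left doubleword immediate then clear left).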

    if (InstMaskStart == 0) {
      SDValue Ops[] =
        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0);
    }
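    // Likewise (illustrative note, not from the original source): a mask that
    // starts at the most-significant bit only needs its low-order side
    // cleared, so the case above emits a single rldicr
    // (rotate left doubleword immediate then clear right) with ME = InstMaskEnd.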

    if (InstMaskEnd == 63 - RLAmt) {
      SDValue Ops[] =
        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0);
    }
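    // Note (illustrative, not from the original source): rldic clears the
    // high-order bits to the left of InstMaskStart and the RLAmt low-order
    // bits, realizing exactly the mask InstMaskStart..63-RLAmt guarded by the
    // condition above.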

    // We cannot do this with a single instruction, so we'll use two. The
    // problem is that we're not free to choose both a rotation amount and mask
    // start and end independently. We can choose an arbitrary mask start and
    // end, but then the rotation amount is fixed. Rotation, however, can be
    // inverted, and so by applying an "inverse" rotation first, we can get the
    // desired result.
    if (InstCnt) *InstCnt += 1;

    // The rotation amount for the second instruction must be MaskStart.
    unsigned RLAmt2 = MaskStart;
    // The first instruction must rotate V so that the overall rotation amount
    // is RLAmt.
    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
    if (RLAmt1)
      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
    return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd);
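    // Worked example (illustrative, not from the original source): for
    // RLAmt = 12 and MaskStart = 40, RLAmt2 = 40 and
    // RLAmt1 = (64 + 12 - 40) % 64 = 36; the two rotations compose to
    // (36 + 40) % 64 = 12 as required. In the recursive call,
    // InstMaskEnd = 63 - MaskStart = 63 - RLAmt2, so it always lands on the
    // single-instruction RLDIC case above.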
  }

  // For 64-bit values, not all combinations of rotates and masks are
  // available. Produce a rotate-mask-and-insert if one is available.
  SDValue SelectRotMaskIns64(SDValue Base, SDValue V, const SDLoc &dl,
                             unsigned RLAmt, bool Repl32, unsigned MaskStart,
                             unsigned MaskEnd, unsigned *InstCnt = nullptr) {
    // In the notation used by the instructions, 'start' and 'end' are reversed
    // because bits are counted from high to low order.
    unsigned InstMaskStart = 64 - MaskEnd - 1,
             InstMaskEnd   = 64 - MaskStart - 1;

    if (InstCnt) *InstCnt += 1;

    if (Repl32) {
      // This rotation amount assumes that the lower 32 bits of the quantity
      // are replicated in the high 32 bits by the rotation operator (which is
      // done by rlwinm and friends).
      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
      assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
      SDValue Ops[] =
        { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
          getI32Imm(InstMaskEnd - 32, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64,
                                            Ops), 0);
    }

    if (InstMaskEnd == 63 - RLAmt) {
      SDValue Ops[] =
        { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
      return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0);
    }

    // We cannot do this with a single instruction, so we'll use two. The
    // problem is that we're not free to choose both a rotation amount and mask
    // start and end independently. We can choose an arbitrary mask start and
    // end, but then the rotation amount is fixed. Rotation, however, can be
    // inverted, and so by applying an "inverse" rotation first, we can get the
    // desired result.
    if (InstCnt) *InstCnt += 1;

    // The rotation amount for the second instruction must be MaskStart.
    unsigned RLAmt2 = MaskStart;
    // The first instruction must rotate V so that the overall rotation amount
    // is RLAmt.
    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
    if (RLAmt1)
      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
    return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd);
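    // As in SelectRotMask64 above (illustrative note, not from the original
    // source): the recursive call satisfies InstMaskEnd == 63 - RLAmt and is
    // therefore guaranteed to select a single RLDIMI.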
  }

  void SelectAndParts64(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
    if (BPermRewriterNoMasking)
      return;

    // The idea here is the same as in the 32-bit version, but with additional
    // complications from the fact that Repl32 might be true. Because we
    // aggressively convert bit groups to Repl32 form (which, for small
    // rotation factors, involves no other change), and then coalesce, it might
    // be the case that a single 64-bit masking operation could handle both
    // some Repl32 groups and some non-Repl32 groups. If converting to Repl32
    // form allowed coalescing, then we must use a 32-bit rotation in order to
    // completely capture the new combined bit group.

    for (ValueRotInfo &VRI : ValueRotsVec) {
      uint64_t Mask = 0;

      // We need to add to the mask all bits from the associated bit groups.
      // If Repl32 is false, we need to add bits from bit groups that have
      // Repl32 true, but are trivially convertible to Repl32 false. Such a
      // group is trivially convertible if it overlaps only with the lower 32
      // bits, and the group has not been coalesced.
      auto MatchingBG = [VRI](const BitGroup &BG) {
        if (VRI.V != BG.V)
          return false;

        unsigned EffRLAmt = BG.RLAmt;
        if (!VRI.Repl32 && BG.Repl32) {
          if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx &&
              !BG.Repl32Coalesced) {
            if (BG.Repl32CR)
              EffRLAmt += 32;
          } else {
            return false;
          }
        } else if (VRI.Repl32 != BG.Repl32) {
          return false;
        }

        return VRI.RLAmt == EffRLAmt;
      };

      for (auto &BG : BitGroups) {
        if (!MatchingBG(BG))
          continue;

        if (BG.StartIdx <= BG.EndIdx) {
          for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i)
            Mask |= (UINT64_C(1) << i);
        } else {
          for (unsigned i = BG.StartIdx; i < Bits.size(); ++i)
            Mask |= (UINT64_C(1) << i);
          for (unsigned i = 0; i <= BG.EndIdx; ++i)
            Mask |= (UINT64_C(1) << i);
        }
      }
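      // Worked example (illustrative, not from the original source): a
      // wrap-around group with StartIdx = 60 and EndIdx = 3 contributes bits
      // 60..63 and 0..3, i.e. Mask |= 0xF00000000000000F.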

      // We can use the 32-bit andi/andis technique if the mask does not
      // require any higher-order bits. This can save an instruction compared
      // to always using the general 64-bit technique.
      bool Use32BitInsts = isUInt<32>(Mask);
      // Compute the masks for andi/andis that would be necessary.
      unsigned ANDIMask = (Mask & UINT16_MAX),
               ANDISMask = (Mask >> 16) & UINT16_MAX;

      bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask));

      unsigned NumAndInsts = (unsigned) NeedsRotate +
                             (unsigned) (bool) Res;
      if (Use32BitInsts)
        NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
                       (unsigned) (ANDIMask != 0 && ANDISMask != 0);
      else
        NumAndInsts += getInt64Count(Mask) + /* and */ 1;
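      // Illustrative count (not from the original source): with a rotate
      // needed, an existing Res to OR into, and both 16-bit halves of the
      // mask non-zero, NumAndInsts = 1 + 1 + (1 + 1 + 1) = 5.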

      unsigned NumRLInsts = 0;
      bool FirstBG = true;
      for (auto &BG : BitGroups) {
        if (!MatchingBG(BG))
          continue;
        NumRLInsts +=
          SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx,
                               !FirstBG);
        FirstBG = false;
      }

      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
                      " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
                      "\n\t\t\tisel using masking: " << NumAndInsts <<
                      " using rotates: " << NumRLInsts << "\n");

      // When we'd use andi/andis, we bias toward using the rotates (andi only
      // has a record form, and is cracked on POWER cores). However, when using
      // general 64-bit constant formation, bias toward the constant form,
      // because that exposes more opportunities for CSE.
      if (NumAndInsts > NumRLInsts)
        continue;
      if (Use32BitInsts && NumAndInsts == NumRLInsts)
        continue;

      DEBUG(dbgs() << "\t\t\t\tusing masking\n");

      if (InstCnt) *InstCnt += NumAndInsts;

      SDValue VRot;
      // We actually need to generate a rotation if we have a non-zero rotation
      // factor or, in the Repl32 case, if we care about any of the
      // higher-order replicated bits. In the latter case, we generate a mask
      // backward so that it actually includes the entire 64 bits.
      if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)))
        VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
                               VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63);
      else
        VRot = VRI.V;
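      // Note (illustrative, not from the original source): the backward
      // (MaskStart, MaskEnd) pairs used here -- (31, 30) for Repl32 and
      // (0, 63) otherwise -- describe wrapped masks covering all 64 bits, so
      // these calls produce a pure rotation with nothing cleared.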

      SDValue TotalVal;
      if (Use32BitInsts) {
        assert((ANDIMask != 0 || ANDISMask != 0) &&
               "No set bits in mask when using 32-bit ands for 64-bit value");

        SDValue ANDIVal, ANDISVal;
        if (ANDIMask != 0)
          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
                              VRot, getI32Imm(ANDIMask, dl)), 0);
        if (ANDISMask != 0)
          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
                               VRot, getI32Imm(ANDISMask, dl)), 0);

        if (!ANDIVal)
          TotalVal = ANDISVal;
        else if (!ANDISVal)
          TotalVal = ANDIVal;
        else
          TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
                               ANDIVal, ANDISVal), 0);
      } else {
        TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
        TotalVal =
          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
                                         VRot, TotalVal), 0);
      }

      if (!Res)
        Res = TotalVal;
      else
        Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
                                             Res, TotalVal), 0);

      // Now, remove all groups with this underlying value and rotation
      // factor.
      eraseMatchingBitGroups(MatchingBG);
    }
  }

  // Instruction selection for the 64-bit case.
  SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) {
    SDLoc dl(N);
    SDValue Res;

    if (InstCnt) *InstCnt = 0;

    // Take care of cases that should use andi/andis first.
    SelectAndParts64(dl, Res, InstCnt);

    // If we've not yet selected a 'starting' instruction, and we have no zeros
    // to fill in, select the (Value, RLAmt) with the highest priority (largest
    // number of groups), and start with this rotated value.
    if ((!HasZeros || LateMask) && !Res) {
|
|
|
|
// If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
|
|
|
|
// groups will come first, and so the VRI representing the largest number
|
|
|
|
// of groups might not be first (it might be the first Repl32 group).
|
|
|
|
unsigned MaxGroupsIdx = 0;
|
|
|
|
if (!ValueRotsVec[0].Repl32) {
|
|
|
|
for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i)
|
|
|
|
if (ValueRotsVec[i].Repl32) {
|
|
|
|
if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups)
|
|
|
|
MaxGroupsIdx = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx];
|
|
|
|
bool NeedsRotate = false;
|
|
|
|
if (VRI.RLAmt) {
|
|
|
|
NeedsRotate = true;
|
|
|
|
} else if (VRI.Repl32) {
|
|
|
|
for (auto &BG : BitGroups) {
|
|
|
|
if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt ||
|
|
|
|
BG.Repl32 != VRI.Repl32)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// We don't need a rotate if the bit group is confined to the lower
|
|
|
|
// 32 bits.
|
|
|
|
if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
NeedsRotate = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NeedsRotate)
|
|
|
|
Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
|
|
|
|
VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63,
|
|
|
|
InstCnt);
|
|
|
|
else
|
|
|
|
Res = VRI.V;
|
|
|
|
|
|
|
|
// Now, remove all groups with this underlying value and rotation factor.
|
|
|
|
if (Res)
|
2015-06-20 23:59:41 +08:00
|
|
|
eraseMatchingBitGroups([VRI](const BitGroup &BG) {
|
|
|
|
return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt &&
|
|
|
|
BG.Repl32 == VRI.Repl32;
|
|
|
|
});
|
2015-01-01 10:53:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Because 64-bit rotates are more flexible than inserts, we might have a
|
|
|
|
// preference regarding which one we do first (to save one instruction).
|
|
|
|
if (!Res)
|
|
|
|
for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) {
|
|
|
|
if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
|
|
|
|
false) <
|
|
|
|
SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
|
|
|
|
true)) {
|
|
|
|
if (I != BitGroups.begin()) {
|
|
|
|
BitGroup BG = *I;
|
|
|
|
BitGroups.erase(I);
|
|
|
|
BitGroups.insert(BitGroups.begin(), BG);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Insert the other groups (one at a time).
|
|
|
|
for (auto &BG : BitGroups) {
|
|
|
|
if (!Res)
|
|
|
|
Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
|
|
|
|
BG.EndIdx, InstCnt);
|
|
|
|
else
|
|
|
|
Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
|
|
|
|
BG.StartIdx, BG.EndIdx, InstCnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (LateMask) {
|
|
|
|
uint64_t Mask = getZerosMask();
|
|
|
|
|
|
|
|
// We can use the 32-bit andi/andis technique if the mask does not
|
|
|
|
// require any higher-order bits. This can save an instruction compared
|
|
|
|
// to always using the general 64-bit technique.
|
|
|
|
bool Use32BitInsts = isUInt<32>(Mask);
|
|
|
|
// Compute the masks for andi/andis that would be necessary.
|
|
|
|
unsigned ANDIMask = (Mask & UINT16_MAX),
|
|
|
|
ANDISMask = (Mask >> 16) & UINT16_MAX;
|
|
|
|
|
|
|
|
if (Use32BitInsts) {
|
|
|
|
assert((ANDIMask != 0 || ANDISMask != 0) &&
|
|
|
|
"No set bits in mask when using 32-bit ands for 64-bit value");
|
|
|
|
|
|
|
|
if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
|
|
|
|
(unsigned) (ANDISMask != 0) +
|
|
|
|
(unsigned) (ANDIMask != 0 && ANDISMask != 0);
|
|
|
|
|
|
|
|
SDValue ANDIVal, ANDISVal;
|
|
|
|
if (ANDIMask != 0)
|
|
|
|
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
Res, getI32Imm(ANDIMask, dl)), 0);
|
2015-01-01 10:53:29 +08:00
|
|
|
if (ANDISMask != 0)
|
|
|
|
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
Res, getI32Imm(ANDISMask, dl)), 0);
|
2015-01-01 10:53:29 +08:00
|
|
|
|
|
|
|
if (!ANDIVal)
|
|
|
|
Res = ANDISVal;
|
|
|
|
else if (!ANDISVal)
|
|
|
|
Res = ANDIVal;
|
|
|
|
else
|
|
|
|
Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
|
|
|
|
ANDIVal, ANDISVal), 0);
|
|
|
|
} else {
|
2016-05-21 05:43:23 +08:00
|
|
|
if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1;
|
2015-01-01 10:53:29 +08:00
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
|
2015-01-01 10:53:29 +08:00
|
|
|
Res =
|
|
|
|
SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
|
|
|
|
Res, MaskVal), 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[PowerPC] Improve instruction selection bit-permuting operations (32-bit)
The PowerPC backend, somewhat embarrassingly, did not generate an
optimal-length sequence of instructions for a 32-bit bswap. While adding a
pattern for the bswap intrinsic to fix this would not have been terribly
difficult, doing so would not have addressed the real problem: we had been
generating poor code for many bit-permuting operations (by which I mean things
like byte-swap that permute the bits of one or more inputs around in various
ways). Here are some initial steps toward solving this deficiency.
Bit-permuting operations are represented, at the SDAG level, using ISD::ROTL,
SHL, SRL, AND and OR (mostly with constant second operands). Looking back
through these operations, we can build up a description of the bits in the
resulting value in terms of bits of one or more input values (and constant
zeros). For each bit, we compute the rotation amount from the original value,
and then group consecutive (value, rotation factor) bits into groups. Groups
sharing these attributes are then collected and sorted, and we can then
instruction select the entire permutation using a combination of masked
rotations (rlwinm), imm ands (andi/andis), and masked rotation inserts
(rlwimi).
The result is that instead of lowering an i32 bswap as:
rlwinm 5, 3, 24, 16, 23
rlwinm 4, 3, 24, 0, 7
rlwimi 4, 3, 8, 8, 15
rlwimi 5, 3, 8, 24, 31
rlwimi 4, 5, 0, 16, 31
we now produce:
rlwinm 4, 3, 8, 0, 31
rlwimi 4, 3, 24, 16, 23
rlwimi 4, 3, 24, 0, 7
and for the 'test6' example in the PowerPC/README.txt file:
unsigned test6(unsigned x) {
return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}
we used to produce:
lis 4, 255
rlwinm 3, 3, 16, 0, 31
ori 4, 4, 255
and 3, 3, 4
and now we produce:
rlwinm 4, 3, 16, 24, 31
rlwimi 4, 3, 16, 8, 15
and, as a nice bonus, this fixes the FIXME in
test/CodeGen/PowerPC/rlwimi-and.ll.
This commit does not include instruction-selection for i64 operations, those
will come later.
llvm-svn: 224318
2014-12-16 13:51:41 +08:00
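To make the grouping step described above concrete, here is a small standalone C++ sketch; it is an illustration only, not the selector's own data structures (the selector works on ValueBits of SDValues and uses PPC's MSB-0 bit numbering, while this uses LSB-0). It computes, for the i32 bswap used as the example, the rotate-left amount feeding each result bit and reports where each contiguous (value, rotation) group starts.
#include <cstdio>

int main() {
  unsigned PrevRLAmt = ~0u;
  for (unsigned i = 0; i < 32; ++i) {              // result bits, LSB = 0
    unsigned SrcBit = (3 - i / 8) * 8 + i % 8;     // bswap: result byte k <- source byte 3-k
    unsigned RLAmt  = (i - SrcBit) & 31;           // rotl amount moving SrcBit to position i
    if (RLAmt != PrevRLAmt) {                      // a new contiguous group begins here
      std::printf("group at bit %2u: rotate-left by %u\n", i, RLAmt);
      PrevRLAmt = RLAmt;
    }
  }
  // Four contiguous groups with two distinct rotations (8 and 24): one rlwinm
  // covers the rotate-by-8 groups and two rlwimi inserts cover the rotate-by-24
  // groups, matching the three-instruction sequence shown above.
  return 0;
}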
|
|
|
return Res.getNode();
|
|
|
|
}
|
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
|
|
|
|
// Fill in BitGroups.
|
|
|
|
collectBitGroups(LateMask);
|
|
|
|
if (BitGroups.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// For 64-bit values, figure out when we can use 32-bit instructions.
|
|
|
|
if (Bits.size() == 64)
|
|
|
|
assignRepl32BitGroups();
|
|
|
|
|
|
|
|
// Fill in ValueRotsVec.
|
|
|
|
collectValueRotInfo();
|
|
|
|
|
|
|
|
if (Bits.size() == 32) {
|
|
|
|
return Select32(N, LateMask, InstCnt);
|
|
|
|
} else {
|
|
|
|
assert(Bits.size() == 64 && "Not 64 bits here?");
|
|
|
|
return Select64(N, LateMask, InstCnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2015-06-20 23:59:41 +08:00
|
|
|
void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) {
|
|
|
|
BitGroups.erase(std::remove_if(BitGroups.begin(), BitGroups.end(), F),
|
|
|
|
BitGroups.end());
|
|
|
|
}
|
|
|
|
|
2014-12-16 13:51:41 +08:00
|
|
|
SmallVector<ValueBit, 64> Bits;
|
|
|
|
|
|
|
|
bool HasZeros;
|
|
|
|
SmallVector<unsigned, 64> RLAmt;
|
|
|
|
|
|
|
|
SmallVector<BitGroup, 16> BitGroups;
|
|
|
|
|
|
|
|
DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
|
|
|
|
SmallVector<ValueRotInfo, 16> ValueRotsVec;
|
|
|
|
|
|
|
|
SelectionDAG *CurDAG;
|
|
|
|
|
|
|
|
public:
|
|
|
|
BitPermutationSelector(SelectionDAG *DAG)
|
|
|
|
: CurDAG(DAG) {}
|
|
|
|
|
|
|
|
// Here we try to match complex bit permutations into a set of
|
|
|
|
// rotate-and-shift/shift/and/or instructions, using a set of heuristics
|
|
|
|
// known to produce optimal code for common cases (like i32 byte swapping).
|
|
|
|
SDNode *Select(SDNode *N) {
|
|
|
|
Bits.resize(N->getValueType(0).getSizeInBits());
|
|
|
|
if (!getValueBits(SDValue(N, 0), Bits))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "Considering bit-permutation-based instruction"
|
|
|
|
" selection for: ");
|
|
|
|
DEBUG(N->dump(CurDAG));
|
|
|
|
|
|
|
|
// Fill in RLAmt and set HasZeros.
|
|
|
|
computeRotationAmounts();
|
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
if (!HasZeros)
|
|
|
|
return Select(N, false);
|
|
|
|
|
|
|
|
// We currently have two techniques for handling results with zeros: early
|
|
|
|
// masking (the default) and late masking. Late masking is sometimes more
|
|
|
|
// efficient, but because the structure of the bit groups is different, it
|
|
|
|
// is hard to tell without generating both and comparing the results. With
|
|
|
|
// late masking, we ignore zeros in the resulting value when inserting each
|
|
|
|
// set of bit groups, and then mask in the zeros at the end. With early
|
|
|
|
// masking, we only insert the non-zero parts of the result at every step.
|
|
|
|
|
|
|
|
unsigned InstCnt, InstCntLateMask;
|
|
|
|
DEBUG(dbgs() << "\tEarly masking:\n");
|
|
|
|
SDNode *RN = Select(N, false, &InstCnt);
|
|
|
|
DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "\tLate masking:\n");
|
|
|
|
SDNode *RNLM = Select(N, true, &InstCntLateMask);
|
|
|
|
DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
|
|
|
|
" instructions\n");
|
|
|
|
|
|
|
|
if (InstCnt <= InstCntLateMask) {
|
|
|
|
DEBUG(dbgs() << "\tUsing early-masking for isel\n");
|
|
|
|
return RN;
|
2014-12-16 13:51:41 +08:00
|
|
|
}
|
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
DEBUG(dbgs() << "\tUsing late-masking for isel\n");
|
|
|
|
return RNLM;
|
2014-12-16 13:51:41 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
} // anonymous namespace
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
|
2014-12-16 13:51:41 +08:00
|
|
|
if (N->getValueType(0) != MVT::i32 &&
|
|
|
|
N->getValueType(0) != MVT::i64)
|
2016-05-21 05:43:23 +08:00
|
|
|
return false;
|
2014-12-16 13:51:41 +08:00
|
|
|
|
2015-01-01 10:53:29 +08:00
|
|
|
if (!UseBitPermRewriter)
|
2016-05-21 05:43:23 +08:00
|
|
|
return false;
|
2015-01-01 10:53:29 +08:00
|
|
|
|
2014-12-16 13:51:41 +08:00
|
|
|
switch (N->getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case ISD::ROTL:
|
|
|
|
case ISD::SHL:
|
|
|
|
case ISD::SRL:
|
|
|
|
case ISD::AND:
|
|
|
|
case ISD::OR: {
|
|
|
|
BitPermutationSelector BPS(CurDAG);
|
2016-05-21 05:43:23 +08:00
|
|
|
if (SDNode *New = BPS.Select(N)) {
|
|
|
|
ReplaceNode(N, New);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
2014-12-16 13:51:41 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
return false;
|
2014-12-16 13:51:41 +08:00
|
|
|
}
|
|
|
|
|
2005-08-22 02:50:37 +08:00
|
|
|
/// SelectCC - Select a comparison of the specified values with the specified
|
|
|
|
/// condition code, returning the CR# of the expression.
|
2016-06-12 23:39:02 +08:00
|
|
|
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
|
|
|
|
const SDLoc &dl) {
|
2005-08-22 02:50:37 +08:00
|
|
|
// Always select the LHS.
|
2006-06-27 08:04:13 +08:00
|
|
|
unsigned Opc;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2009-08-12 04:47:22 +08:00
|
|
|
if (LHS.getValueType() == MVT::i32) {
|
2006-06-27 08:10:13 +08:00
|
|
|
unsigned Imm;
|
Two improvements:
1. Codegen this comparison:
if (X == 0x8000)
as:
cmplwi cr0, r3, 32768
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 0
ori r2, r2, 32768
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
2. Codegen this comparison:
if (X == 0x12345678)
as:
xoris r2, r3, 4660
cmplwi cr0, r2, 22136
bne cr0, LBB1_2 ;cond_next
instead of:
lis r2, 4660
ori r2, r2, 22136
cmpw cr0, r3, r2
bne cr0, LBB1_2 ;cond_next
llvm-svn: 30509
2006-09-20 12:25:47 +08:00
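A quick standalone check of the identity behind the xoris/cmplwi folding shown above: this is plain C++ written as an illustration, not backend code, and the names Hi, Lo, and Tests are ad-hoc. For an equality test against a 32-bit constant C, x == C exactly when xor'ing away the high half of C leaves the low half of C, which is why xoris followed by cmplwi replaces materializing C with lis/ori and comparing.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C  = 0x12345678u;                 // the constant from the example above
  const uint32_t Hi = C & 0xFFFF0000u;             // handled by xoris
  const uint32_t Lo = C & 0x0000FFFFu;             // handled by cmplwi
  const uint32_t Tests[] = {0u, C, C + 1, C ^ 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Tests) {
    bool Direct = (X == C);                        // lis/ori + cmpw form
    bool Folded = ((X ^ Hi) == Lo);                // xoris + cmplwi form
    assert(Direct == Folded);
  }
  return 0;
}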
|
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
|
|
|
if (isInt32Immediate(RHS, Imm)) {
|
|
|
|
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)),
|
|
|
|
0);
|
2006-09-20 12:25:47 +08:00
|
|
|
// If this is a 16-bit signed immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<16>((int)Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)),
|
|
|
|
0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-09-20 12:25:47 +08:00
|
|
|
// For non-equality comparisons, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
2010-12-24 12:28:06 +08:00
|
|
|
// ori r2, r2, 22136
|
2006-09-20 12:25:47 +08:00
|
|
|
// cmpw cr0, r3, r2
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmplwi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm >> 16, dl)), 0);
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)), 0);
|
2006-09-20 12:25:47 +08:00
|
|
|
}
|
|
|
|
Opc = PPC::CMPLW;
|
|
|
|
} else if (ISD::isUnsignedIntSetCC(CC)) {
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)), 0);
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::CMPLW;
|
|
|
|
} else {
|
|
|
|
short SImm;
|
|
|
|
if (isIntS16Immediate(RHS, SImm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm((int)SImm & 0xFFFF,
|
|
|
|
dl)),
|
2006-06-27 08:04:13 +08:00
|
|
|
0);
|
|
|
|
Opc = PPC::CMPW;
|
|
|
|
}
|
2009-08-12 04:47:22 +08:00
|
|
|
} else if (LHS.getValueType() == MVT::i64) {
|
2006-06-27 08:04:13 +08:00
|
|
|
uint64_t Imm;
|
2006-09-20 12:33:27 +08:00
|
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isInt64Immediate(RHS.getNode(), Imm)) {
|
2006-09-20 12:33:27 +08:00
|
|
|
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)),
|
|
|
|
0);
|
2006-09-20 12:33:27 +08:00
|
|
|
// If this is a 16-bit signed immediate, fold it.
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(Imm & 0xFFFF, dl)),
|
|
|
|
0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-09-20 12:33:27 +08:00
|
|
|
// For non-equality comparisons, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
2010-12-24 12:28:06 +08:00
|
|
|
// ori r2, r2, 22136
|
2006-09-20 12:33:27 +08:00
|
|
|
// cmpd cr0, r3, r2
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmpldi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isUInt<32>(Imm)) {
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI64Imm(Imm >> 16, dl)), 0);
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI64Imm(Imm & 0xFFFF, dl)),
|
|
|
|
0);
|
2006-09-20 12:33:27 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Opc = PPC::CMPLD;
|
|
|
|
} else if (ISD::isUnsignedIntSetCC(CC)) {
|
2010-03-30 05:13:41 +08:00
|
|
|
if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI64Imm(Imm & 0xFFFF, dl)), 0);
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::CMPLD;
|
|
|
|
} else {
|
|
|
|
short SImm;
|
|
|
|
if (isIntS16Immediate(RHS, SImm))
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI64Imm(SImm & 0xFFFF, dl)),
|
2006-06-27 08:04:13 +08:00
|
|
|
0);
|
|
|
|
Opc = PPC::CMPD;
|
|
|
|
}
|
2009-08-12 04:47:22 +08:00
|
|
|
} else if (LHS.getValueType() == MVT::f32) {
|
2006-06-27 08:04:13 +08:00
|
|
|
Opc = PPC::FCMPUS;
|
2005-08-22 02:50:37 +08:00
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
|
2014-05-22 09:07:24 +08:00
|
|
|
Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
2009-09-26 02:54:59 +08:00
|
|
|
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
|
2005-08-22 02:50:37 +08:00
|
|
|
}
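A minimal scalar sketch, not part of the original file, of the SETEQ/SETNE constant folds performed above: when an equality compare uses a 32-bit constant that does not fit in 16 bits, xoris cancels the high half and cmplwi checks the low half (the 64-bit path uses xoris8/cmpldi the same way). The helper name is invented.

// Sketch only (assumes 32-bit unsigned): models (X == Imm) as xoris + cmplwi.
static bool eqViaXorisCmplwiSketch(unsigned X, unsigned Imm) {
  unsigned Hi = (Imm >> 16) & 0xFFFF; // immediate operand of xoris
  unsigned Lo = Imm & 0xFFFF;         // immediate operand of cmplwi
  unsigned Tmp = X ^ (Hi << 16);      // xoris rT, rX, Hi
  return Tmp == Lo;                   // cmplwi cr0, rT, Lo ; beq/bne
}

When Hi is zero this degenerates to the plain cmplwi fold against a 16-bit immediate.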
|
|
|
|
|
2006-11-18 06:10:59 +08:00
|
|
|
static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
|
2005-08-22 02:50:37 +08:00
|
|
|
switch (CC) {
|
2006-05-26 00:54:16 +08:00
|
|
|
case ISD::SETUEQ:
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETONE:
|
|
|
|
case ISD::SETOLE:
|
|
|
|
case ISD::SETOGE:
|
2009-07-15 00:55:14 +08:00
|
|
|
llvm_unreachable("Should be lowered by legalize!");
|
|
|
|
default: llvm_unreachable("Unknown condition!");
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETOEQ:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETEQ: return PPC::PRED_EQ;
|
2006-05-26 00:54:16 +08:00
|
|
|
case ISD::SETUNE:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETNE: return PPC::PRED_NE;
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETOLT:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETLT: return PPC::PRED_LT;
|
2005-08-22 02:50:37 +08:00
|
|
|
case ISD::SETULE:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETLE: return PPC::PRED_LE;
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETOGT:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETGT: return PPC::PRED_GT;
|
2005-08-22 02:50:37 +08:00
|
|
|
case ISD::SETUGE:
|
2006-11-18 06:10:59 +08:00
|
|
|
case ISD::SETGE: return PPC::PRED_GE;
|
|
|
|
case ISD::SETO: return PPC::PRED_NU;
|
|
|
|
case ISD::SETUO: return PPC::PRED_UN;
|
2008-11-08 06:54:33 +08:00
|
|
|
// These two are invalid for floating point. Assume we have int.
|
|
|
|
case ISD::SETULT: return PPC::PRED_LT;
|
|
|
|
case ISD::SETUGT: return PPC::PRED_GT;
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-08-26 04:08:18 +08:00
|
|
|
/// getCRIdxForSetCC - Return the index of the condition register field
|
|
|
|
/// associated with the SetCC condition, and whether or not the field is
|
|
|
|
/// treated as inverted. That is, lt = 0; ge = 0 inverted.
|
2013-07-03 23:13:30 +08:00
|
|
|
static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
|
2008-01-08 14:46:30 +08:00
|
|
|
Invert = false;
|
2005-08-26 04:08:18 +08:00
|
|
|
switch (CC) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unknown condition!");
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETOLT:
|
|
|
|
case ISD::SETLT: return 0; // Bit #0 = SETOLT
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETGT: return 1; // Bit #1 = SETOGT
|
|
|
|
case ISD::SETOEQ:
|
|
|
|
case ISD::SETEQ: return 2; // Bit #2 = SETOEQ
|
|
|
|
case ISD::SETUO: return 3; // Bit #3 = SETUO
|
2005-08-26 04:08:18 +08:00
|
|
|
case ISD::SETUGE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE
|
2005-08-26 04:08:18 +08:00
|
|
|
case ISD::SETULE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE
|
2006-05-26 02:06:16 +08:00
|
|
|
case ISD::SETUNE:
|
2008-01-08 14:46:30 +08:00
|
|
|
case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE
|
|
|
|
case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO
|
2010-12-24 12:28:06 +08:00
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETOLE:
|
2008-11-08 06:54:33 +08:00
|
|
|
case ISD::SETONE:
|
2009-07-15 00:55:14 +08:00
|
|
|
llvm_unreachable("Invalid branch code: should be expanded by legalize");
|
2008-11-08 06:54:33 +08:00
|
|
|
// These are invalid for floating point. Assume integer.
|
|
|
|
case ISD::SETULT: return 0;
|
|
|
|
case ISD::SETUGT: return 1;
|
2005-08-26 04:08:18 +08:00
|
|
|
}
|
|
|
|
}
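An illustrative sketch, not part of the original file, showing how the (Idx, Invert) pair returned above can be turned into a 0/1 value once the CR field has been copied into a GPR; it mirrors the RLWINM/XORI sequence emitted at the end of trySETCC below. The helper name and the CRField parameter are invented.

// Sketch only: CRField is the 4-bit condition field with LT in the most
// significant bit of the nibble, i.e. {LT, GT, EQ, UN} = bits 3..0.
static unsigned crBitToBoolSketch(unsigned CRField, unsigned Idx, bool Invert) {
  unsigned Bit = (CRField >> (3 - Idx)) & 1; // the RLWINM bit extraction
  return Invert ? (Bit ^ 1) : Bit;           // the XORI step when inverted
}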
|
2005-08-22 06:31:09 +08:00
|
|
|
|
2012-10-30 21:50:19 +08:00
|
|
|
// getVCmpInst: return the vector compare instruction for the specified
|
|
|
|
// vector type and condition code. Since this is used for Altivec and VSX
|
2015-03-04 03:55:45 +08:00
|
|
|
// vector compares, only the vector types they handle are supported (v16i8,
// v8i16, v4i32, v2i64, v4f32, and v2f64).
|
2014-08-04 21:13:57 +08:00
|
|
|
static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
|
|
|
|
bool HasVSX, bool &Swap, bool &Negate) {
|
|
|
|
Swap = false;
|
|
|
|
Negate = false;
|
2012-10-30 21:50:19 +08:00
|
|
|
|
2014-08-04 21:13:57 +08:00
|
|
|
if (VecVT.isFloatingPoint()) {
|
|
|
|
/* Handle some cases by swapping input operands. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETLE: CC = ISD::SETGE; Swap = true; break;
|
|
|
|
case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
|
|
|
|
case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break;
|
|
|
|
case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break;
|
|
|
|
case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
|
|
|
|
case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
/* Handle some cases by negating the result. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
|
|
|
|
case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break;
|
|
|
|
case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break;
|
|
|
|
case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
/* We have instructions implementing the remaining cases. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETEQ:
|
|
|
|
case ISD::SETOEQ:
|
|
|
|
if (VecVT == MVT::v4f32)
|
|
|
|
return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPEQDP;
|
|
|
|
break;
|
|
|
|
case ISD::SETGT:
|
|
|
|
case ISD::SETOGT:
|
|
|
|
if (VecVT == MVT::v4f32)
|
|
|
|
return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPGTDP;
|
|
|
|
break;
|
|
|
|
case ISD::SETGE:
|
|
|
|
case ISD::SETOGE:
|
|
|
|
if (VecVT == MVT::v4f32)
|
|
|
|
return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
|
|
|
|
else if (VecVT == MVT::v2f64)
|
|
|
|
return PPC::XVCMPGEDP;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
llvm_unreachable("Invalid floating-point vector compare condition");
|
|
|
|
} else {
|
|
|
|
/* Handle some cases by swapping input operands. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETGE: CC = ISD::SETLE; Swap = true; break;
|
|
|
|
case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
|
|
|
|
case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
|
|
|
|
case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
/* Handle some cases by negating the result. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
|
|
|
|
case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break;
|
|
|
|
case ISD::SETLE: CC = ISD::SETGT; Negate = true; break;
|
|
|
|
case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
/* We have instructions implementing the remaining cases. */
|
|
|
|
switch (CC) {
|
|
|
|
case ISD::SETEQ:
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPEQUB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPEQUH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPEQUW;
|
2015-03-04 03:55:45 +08:00
|
|
|
else if (VecVT == MVT::v2i64)
|
|
|
|
return PPC::VCMPEQUD;
|
2014-08-04 21:13:57 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETGT:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPGTSB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPGTSH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPGTSW;
|
2015-03-04 03:55:45 +08:00
|
|
|
else if (VecVT == MVT::v2i64)
|
|
|
|
return PPC::VCMPGTSD;
|
2014-08-04 21:13:57 +08:00
|
|
|
break;
|
|
|
|
case ISD::SETUGT:
|
|
|
|
if (VecVT == MVT::v16i8)
|
|
|
|
return PPC::VCMPGTUB;
|
|
|
|
else if (VecVT == MVT::v8i16)
|
|
|
|
return PPC::VCMPGTUH;
|
|
|
|
else if (VecVT == MVT::v4i32)
|
|
|
|
return PPC::VCMPGTUW;
|
2015-03-04 03:55:45 +08:00
|
|
|
else if (VecVT == MVT::v2i64)
|
|
|
|
return PPC::VCMPGTUD;
|
2014-08-04 21:13:57 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
llvm_unreachable("Invalid integer vector compare condition");
|
2012-10-30 21:50:19 +08:00
|
|
|
}
|
|
|
|
}
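A hedged scalar illustration, not part of the original file, of the Negate handling in the floating-point branch above: SETULE is selected as the negation of an ordered greater-than compare, which remains correct with NaNs because an ordered '>' is false whenever either input is NaN. The helper name is invented.

// Sketch only: per-lane model of SETULE = NOT(SETOGT); the Swap cases work
// the same way with the operand order reversed.
static bool setULESketch(double A, double B) {
  bool OGT = (A > B); // ordered greater-than, false if A or B is NaN
  return !OGT;        // unordered-or-less-or-equal
}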
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2005-10-07 03:03:35 +08:00
|
|
|
unsigned Imm;
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT PtrVT =
|
|
|
|
CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
|
2011-06-20 23:28:39 +08:00
|
|
|
bool isPPC64 = (PtrVT == MVT::i64);
|
|
|
|
|
2014-05-22 09:07:24 +08:00
|
|
|
if (!PPCSubTarget->useCRBits() &&
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
isInt32Immediate(N->getOperand(1), Imm)) {
|
2005-10-07 03:03:35 +08:00
|
|
|
// We can codegen setcc op, imm very efficiently compared to a brcond.
|
|
|
|
// Check for those cases here.
|
|
|
|
// setcc op, 0
|
|
|
|
if (Imm == 0) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Op = N->getOperand(0);
|
2005-10-07 03:03:35 +08:00
|
|
|
switch (CC) {
|
2005-10-22 05:17:10 +08:00
|
|
|
default: break;
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETEQ: {
|
2009-09-26 02:54:59 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl),
|
|
|
|
getI32Imm(31, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return true;
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETNE: {
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue AD =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2015-04-28 22:05:47 +08:00
|
|
|
Op, getI32Imm(~0U, dl)), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1));
|
|
|
|
return true;
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETLT: {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
|
|
|
|
getI32Imm(31, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return true;
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETGT: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue T =
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
|
|
|
|
T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl),
|
|
|
|
getI32Imm(31, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return true;
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
|
|
|
} else if (Imm == ~0U) { // setcc op, -1
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Op = N->getOperand(0);
|
2005-10-07 03:03:35 +08:00
|
|
|
switch (CC) {
|
2005-10-22 05:17:10 +08:00
|
|
|
default: break;
|
|
|
|
case ISD::SETEQ:
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2010-12-21 10:38:05 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2015-04-28 22:05:47 +08:00
|
|
|
Op, getI32Imm(1, dl)), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
|
|
|
|
SDValue(CurDAG->getMachineNode(PPC::LI, dl,
|
|
|
|
MVT::i32,
|
|
|
|
getI32Imm(0, dl)),
|
|
|
|
0), Op.getValue(1));
|
|
|
|
return true;
|
2005-10-22 05:17:10 +08:00
|
|
|
case ISD::SETNE: {
|
2011-06-20 23:28:39 +08:00
|
|
|
if (isPPC64) break;
|
2009-09-26 02:54:59 +08:00
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
|
2010-12-21 10:38:05 +08:00
|
|
|
SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2015-04-28 22:05:47 +08:00
|
|
|
Op, getI32Imm(~0U, dl));
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op,
|
|
|
|
SDValue(AD, 1));
|
|
|
|
return true;
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
|
|
|
case ISD::SETLT: {
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(1, dl)), 0);
|
2009-09-26 02:54:59 +08:00
|
|
|
SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
|
|
|
|
Op), 0);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl),
|
|
|
|
getI32Imm(31, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return true;
|
2005-10-22 05:17:10 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
case ISD::SETGT: {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
|
|
|
|
getI32Imm(31, dl) };
|
|
|
|
Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl));
|
|
|
|
return true;
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
2006-08-27 16:14:06 +08:00
|
|
|
}
|
2005-10-07 03:03:35 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2012-10-09 02:59:53 +08:00
|
|
|
SDValue LHS = N->getOperand(0);
|
|
|
|
SDValue RHS = N->getOperand(1);
|
|
|
|
|
2012-10-30 21:50:19 +08:00
|
|
|
// Altivec vector compare instructions do not set any CR register by default, and
|
|
|
|
// vector compare operations return the same type as the operands.
|
2012-10-09 02:59:53 +08:00
|
|
|
if (LHS.getValueType().isVector()) {
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bytes
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
if (PPCSubTarget->hasQPX())
|
2016-05-21 05:43:23 +08:00
|
|
|
return false;
|
[PowerPC] Add support for the QPX vector instruction set
2015-02-25 09:06:45 +08:00
|
|
|
|
2012-10-30 21:50:19 +08:00
|
|
|
EVT VecVT = LHS.getValueType();
|
2014-08-04 21:13:57 +08:00
|
|
|
bool Swap, Negate;
|
|
|
|
unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
|
|
|
|
PPCSubTarget->hasVSX(), Swap, Negate);
|
|
|
|
if (Swap)
|
|
|
|
std::swap(LHS, RHS);
|
|
|
|
|
2015-08-20 11:02:02 +08:00
|
|
|
EVT ResVT = VecVT.changeVectorElementTypeToInteger();
|
2014-08-04 21:13:57 +08:00
|
|
|
if (Negate) {
|
2015-08-20 11:02:02 +08:00
|
|
|
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
|
|
|
|
ResVT, VCmp, VCmp);
|
|
|
|
return true;
|
2012-10-30 21:50:19 +08:00
|
|
|
}
|
2014-08-04 21:13:57 +08:00
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
|
|
|
|
return true;
|
2012-10-09 02:59:53 +08:00
|
|
|
}
|
|
|
|
|
2014-05-22 09:07:24 +08:00
|
|
|
if (PPCSubTarget->useCRBits())
|
2016-05-21 05:43:23 +08:00
|
|
|
return false;
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 08:27:01 +08:00
|
|
|
|
2005-10-07 03:03:35 +08:00
|
|
|
bool Inv;
|
2013-07-03 23:13:30 +08:00
|
|
|
unsigned Idx = getCRIdxForSetCC(CC, Inv);
|
2012-10-09 02:59:53 +08:00
|
|
|
SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue IntCR;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-07 03:03:35 +08:00
|
|
|
// Force the ccreg into CR7.
|
2009-08-12 04:47:22 +08:00
|
|
|
SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2014-04-25 13:30:21 +08:00
|
|
|
SDValue InFlag(nullptr, 0); // Null incoming flag value.
|
2010-12-24 12:28:06 +08:00
|
|
|
CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
|
2005-12-01 11:50:19 +08:00
|
|
|
InFlag).getValue(1);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
|
|
|
|
CCReg), 0);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl),
|
|
|
|
getI32Imm(31, dl), getI32Imm(31, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
if (!Inv) {
|
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return true;
|
|
|
|
}
|
2008-01-08 14:46:30 +08:00
|
|
|
|
|
|
|
// Get the specified bit.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Tmp =
|
2013-04-20 06:22:57 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl));
|
|
|
|
return true;
|
2005-10-07 03:03:35 +08:00
|
|
|
}
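Two scalar models, not part of the original file and with invented names, of the compare-against-zero tricks used in trySETCC above, assuming 32-bit unsigned arithmetic: the CNTLZW/RLWINM sequence for SETEQ and the ADDIC/SUBFE carry trick for SETNE.

// Sketch only: cntlzw returns 32 exactly for zero, so the logical shift right
// by 5 implied by rlwinm(..., 27, 5, 31) above yields the SETEQ result.
static unsigned seteqZeroSketch(unsigned X) {
  unsigned LZ = 0;
  for (unsigned Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
    ++LZ;           // cntlzw rT, rX
  return LZ >> 5;   // rlwinm rD, rT, 27, 5, 31
}

// Sketch only: addic rA, rX, -1 produces a carry exactly when X != 0, and for
// this pattern the subfe result equals that carry bit.
static unsigned setneZeroSketch(unsigned X) {
  unsigned long long Sum = (unsigned long long)X + 0xFFFFFFFFull;
  return (unsigned)(Sum >> 32); // carry out == (X != 0)
}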
|
2005-10-07 02:56:10 +08:00
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
|
[PowerPC] Make LDtocL and friends invariant loads
LDtocL, and other loads that roughly correspond to the TOC_ENTRY SDAG node,
represent loads from the TOC, which is invariant. As a result, these loads can
be hoisted out of loops, etc. In order to do this, we need to generate
GOT-style MMOs for TOC_ENTRY, which requires treating it as a legitimate memory
intrinsic node type. Once this is done, the MMO transfer is automatically
handled for TableGen-driven instruction selection, and for nodes generated
directly in PPCISelDAGToDAG, we need to transfer the MMOs manually.
Also, we were not transferring MMOs associated with pre-increment loads, so do
that too.
Lastly, this fixes an exposed bug where R30 was not added as a defined operand of
UpdateGBR.
This problem was highlighted by an example (used to generate the test case)
posted to llvmdev by Francois Pichet.
llvm-svn: 230553
2015-02-26 05:36:59 +08:00
|
|
|
// Transfer memoperands.
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
|
|
|
|
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
|
|
|
|
}
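Three scalar models, not part of the original file and with invented helper names, of folds performed in the ISD::AND and ISD::OR cases of Select() further below, assuming plain unsigned two's-complement arithmetic: folding a logical right shift into a single rldicl when the mask is a low-bit run, the (and (or x, c1), c2) form that RLWIMI implements, and the OR-acts-as-ADD condition used for the FrameIndex fold.

// Sketch only: rldicl(V, SH, MB) rotates left by SH and keeps the low 64 - MB
// bits. For a mask of K low ones (MB = 64 - K) and a right shift by N <= MB,
// a single rldicl(x, 64 - N, MB) equals (x >> N) & mask.
static unsigned long long rldiclSketch(unsigned long long V, unsigned SH,
                                       unsigned MB) {
  SH &= 63;
  unsigned long long Rot = SH ? (V << SH) | (V >> (64 - SH)) : V;
  return MB ? (Rot & (~0ull >> MB)) : Rot;
}
static bool checkRldiclFoldSketch(unsigned long long X, unsigned N, unsigned K) {
  unsigned MB = 64 - K;            // as the AND case computes from the mask
  if (N == 0 || N > MB || K == 0 || K >= 64)
    return true;                   // outside the fold's preconditions
  unsigned long long Mask = (1ull << K) - 1;
  return ((X >> N) & Mask) == rldiclSketch(X, 64 - N, MB);
}

// Sketch only: when no bit has c1 = 1 and c2 = 0, and m = ~(c1 ^ c2) is a run
// of ones, (x | c1) & c2 equals (c1 & m) | (x & ~m), which is what the emitted
// RLWIMI computes.
static bool checkRlwimiFoldSketch(unsigned X, unsigned C1, unsigned C2) {
  if (C1 & ~C2)
    return true;                   // precondition of the fold not met
  unsigned M = ~(C1 ^ C2);         // must also be a run of ones to encode
  return ((X | C1) & C2) == ((C1 & M) | (X & ~M));
}

// Sketch only (sign-extension of the 16-bit immediate ignored): OR behaves as
// ADD when every bit set in the constant is known zero in the other operand.
static bool orIsAddSketch(unsigned long long X, unsigned long long KnownZero,
                          unsigned long long Imm) {
  if ((X & KnownZero) != 0)
    return true;                   // KnownZero must be consistent with X
  if ((KnownZero | ~Imm) != ~0ull)
    return true;                   // the fold would not fire
  return (X | Imm) == (X + Imm);   // holds because X & Imm == 0
}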
|
|
|
|
|
2005-10-07 03:07:45 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
// Select - Convert the specified operand from a target-independent to a
|
|
|
|
// target-specific node if it hasn't already been changed.
|
2016-05-21 05:43:23 +08:00
|
|
|
void PPCDAGToDAGISel::Select(SDNode *N) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2013-09-22 16:21:56 +08:00
|
|
|
if (N->isMachineOpcode()) {
|
|
|
|
N->setNodeId(-1);
|
2016-05-21 05:43:23 +08:00
|
|
|
return; // Already selected.
|
2013-09-22 16:21:56 +08:00
|
|
|
}
|
2005-09-29 08:59:32 +08:00
|
|
|
|
2014-09-02 14:23:54 +08:00
|
|
|
// In case any misguided DAG-level optimizations form an ADD with a
|
|
|
|
// TargetConstant operand, crash here instead of miscompiling (by selecting
|
|
|
|
// an r+r add instead of some kind of r+i add).
|
|
|
|
if (N->getOpcode() == ISD::ADD &&
|
|
|
|
N->getOperand(1).getOpcode() == ISD::TargetConstant)
|
|
|
|
llvm_unreachable("Invalid ADD with TargetConstant operand");
|
|
|
|
|
[PowerPC] Improve instruction selection bit-permuting operations (32-bit)
The PowerPC backend, somewhat embarrassingly, did not generate an
optimal-length sequence of instructions for a 32-bit bswap. While adding a
pattern for the bswap intrinsic to fix this would not have been terribly
difficult, doing so would not have addressed the real problem: we had been
generating poor code for many bit-permuting operations (by which I mean things
like byte-swap that permute the bits of one or more inputs around in various
ways). Here are some initial steps toward solving this deficiency.
Bit-permuting operations are represented, at the SDAG level, using ISD::ROTL,
SHL, SRL, AND and OR (mostly with constant second operands). Looking back
through these operations, we can build up a description of the bits in the
resulting value in terms of bits of one or more input values (and constant
zeros). For each bit, we compute the rotation amount from the original value,
and then group consecutive (value, rotation factor) bits into groups. Groups
sharing these attributes are then collected and sorted, and we can then
instruction select the entire permutation using a combination of masked
rotations (rlwinm), imm ands (andi/andis), and masked rotation inserts
(rlwimi).
The result is that instead of lowering an i32 bswap as:
rlwinm 5, 3, 24, 16, 23
rlwinm 4, 3, 24, 0, 7
rlwimi 4, 3, 8, 8, 15
rlwimi 5, 3, 8, 24, 31
rlwimi 4, 5, 0, 16, 31
we now produce:
rlwinm 4, 3, 8, 0, 31
rlwimi 4, 3, 24, 16, 23
rlwimi 4, 3, 24, 0, 7
and for the 'test6' example in the PowerPC/README.txt file:
unsigned test6(unsigned x) {
return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}
we used to produce:
lis 4, 255
rlwinm 3, 3, 16, 0, 31
ori 4, 4, 255
and 3, 3, 4
and now we produce:
rlwinm 4, 3, 16, 24, 31
rlwimi 4, 3, 16, 8, 15
and, as a nice bonus, this fixes the FIXME in
test/CodeGen/PowerPC/rlwimi-and.ll.
This commit does not include instruction-selection for i64 operations, those
will come later.
llvm-svn: 224318
2014-12-16 13:51:41 +08:00
|
|
|
// Try matching complex bit permutations before doing anything else.
|
2016-05-21 05:43:23 +08:00
|
|
|
if (tryBitPermutation(N))
|
|
|
|
return;
|
[PowerPC] Improve instruction selection bit-permuting operations (32-bit)
2014-12-16 13:51:41 +08:00
|
|
|
|
2005-08-18 03:33:03 +08:00
|
|
|
switch (N->getOpcode()) {
|
2005-09-08 07:45:15 +08:00
|
|
|
default: break;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-12-12 21:23:43 +08:00
|
|
|
case ISD::Constant: {
|
2016-05-21 05:43:23 +08:00
|
|
|
if (N->getValueType(0) == MVT::i64) {
|
|
|
|
ReplaceNode(N, getInt64(CurDAG, N));
|
|
|
|
return;
|
|
|
|
}
|
2006-12-12 21:23:43 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 08:27:01 +08:00
|
|
|
case ISD::SETCC: {
|
2016-05-21 05:43:23 +08:00
|
|
|
if (trySETCC(N))
|
|
|
|
return;
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
}
|
2006-02-09 08:37:58 +08:00
|
|
|
case PPCISD::GlobalBaseReg:
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, getGlobalBaseReg());
|
|
|
|
return;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
[PowerPC] Better lowering for add/or of a FrameIndex
If we have an add (or an or that is really an add), where one operand is a
FrameIndex and the other operand is a small constant, we can combine the
lowering of the FrameIndex (which is lowered as an add of the FI and a zero
offset) with the constant operand.
Amusingly, this is an old potential improvement entry from
lib/Target/PowerPC/README.txt which had never been resolved. In short, we used
to lower:
%X = alloca { i32, i32 }
%Y = getelementptr {i32,i32}* %X, i32 0, i32 1
ret i32* %Y
as:
addi 3, 1, -8
ori 3, 3, 4
blr
and now we produce:
addi 3, 1, -4
blr
which is much more sensible.
llvm-svn: 224071
2014-12-12 06:51:06 +08:00
|
|
|
case ISD::FrameIndex:
|
2016-05-21 05:43:23 +08:00
|
|
|
selectFrameIndex(N, N);
|
|
|
|
return;
|
2006-03-26 18:06:40 +08:00
|
|
|
|
[PowerPC] Always use mfocrf if available
2013-07-04 01:05:42 +08:00
|
|
|
case PPCISD::MFOCRF: {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue InFlag = N->getOperand(1);
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
|
|
|
|
N->getOperand(0), InFlag));
|
|
|
|
return;
|
2006-03-26 18:06:40 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2014-12-03 06:01:00 +08:00
|
|
|
case PPCISD::READ_TIME_BASE: {
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
|
|
|
|
MVT::Other, N->getOperand(0)));
|
|
|
|
return;
|
2014-12-03 06:01:00 +08:00
|
|
|
}
|
|
|
|
|
2014-12-12 02:37:52 +08:00
|
|
|
case PPCISD::SRA_ADDZE: {
|
|
|
|
SDValue N0 = N->getOperand(0);
|
|
|
|
SDValue ShiftAmt =
|
|
|
|
CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))->
|
2015-04-28 22:05:47 +08:00
|
|
|
getConstantIntValue(), dl,
|
|
|
|
N->getValueType(0));
|
2014-12-12 02:37:52 +08:00
|
|
|
if (N->getValueType(0) == MVT::i64) {
|
|
|
|
SDNode *Op =
|
|
|
|
CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue,
|
|
|
|
N0, ShiftAmt);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, SDValue(Op, 0),
|
|
|
|
SDValue(Op, 1));
|
|
|
|
return;
|
2014-12-12 02:37:52 +08:00
|
|
|
} else {
|
|
|
|
assert(N->getValueType(0) == MVT::i32 &&
|
|
|
|
"Expecting i64 or i32 in PPCISD::SRA_ADDZE");
|
|
|
|
SDNode *Op =
|
|
|
|
CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
|
|
|
|
N0, ShiftAmt);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, SDValue(Op, 0),
|
|
|
|
SDValue(Op, 1));
|
|
|
|
return;
|
2005-08-26 01:50:06 +08:00
|
|
|
}
|
2005-08-26 06:04:30 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-10 10:08:47 +08:00
|
|
|
case ISD::LOAD: {
|
|
|
|
// Handle preincrement loads.
|
2010-01-05 09:24:18 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(N);
|
2009-08-11 06:56:29 +08:00
|
|
|
EVT LoadedVT = LD->getMemoryVT();
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-10 10:08:47 +08:00
|
|
|
// Normal loads are handled by code generated from the .td file.
|
|
|
|
if (LD->getAddressingMode() != ISD::PRE_INC)
|
|
|
|
break;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Offset = LD->getOffset();
|
2013-03-22 22:58:17 +08:00
|
|
|
if (Offset.getOpcode() == ISD::TargetConstant ||
|
allow the offset of a preinc'd load to be the low-part of a global. This
produces this clever code:
_millisecs:
lis r2, ha16(_Time.1182)
lwzu r3, lo16(_Time.1182)(r2)
lwz r2, 4(r2)
addic r4, r2, 1
addze r3, r3
blr
instead of this:
_millisecs:
lis r2, ha16(_Time.1182)
la r3, lo16(_Time.1182)(r2)
lwz r2, lo16(_Time.1182)(r2)
lwz r3, 4(r3)
addic r4, r3, 1
addze r3, r2
blr
for:
long %millisecs() {
%tmp = load long* %Time.1182 ; <long> [#uses=1]
%tmp1 = add long %tmp, 1 ; <long> [#uses=1]
ret long %tmp1
}
llvm-svn: 31673
2006-11-11 12:53:30 +08:00
|
|
|
Offset.getOpcode() == ISD::TargetGlobalAddress) {
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2006-11-16 03:55:13 +08:00
|
|
|
unsigned Opcode;
|
|
|
|
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (LD->getValueType(0) != MVT::i64) {
|
2006-11-16 03:55:13 +08:00
|
|
|
// Handle PPC32 integer and normal FP loads.
|
2009-08-12 04:47:22 +08:00
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::f64: Opcode = PPC::LFDU; break;
|
|
|
|
case MVT::f32: Opcode = PPC::LFSU; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZU; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZU; break;
|
2006-11-16 03:55:13 +08:00
|
|
|
}
|
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i64: Opcode = PPC::LDU; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZU8; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZU8; break;
|
2006-11-16 03:55:13 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue Base = LD->getBasePtr();
|
|
|
|
SDValue Ops[] = { Offset, Base, Chain };
|
2016-05-21 05:43:23 +08:00
|
|
|
SDNode *MN = CurDAG->getMachineNode(
|
|
|
|
Opcode, dl, LD->getValueType(0),
|
|
|
|
PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
|
|
|
|
transferMemOperands(N, MN);
|
|
|
|
ReplaceNode(N, MN);
|
|
|
|
return;
|
2006-11-10 10:08:47 +08:00
|
|
|
} else {
|
2012-06-20 23:43:03 +08:00
|
|
|
unsigned Opcode;
|
|
|
|
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
|
|
|
|
if (LD->getValueType(0) != MVT::i64) {
|
|
|
|
// Handle PPC32 integer and normal FP loads.
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
[PowerPC] Add support for the QPX vector instruction set
2015-02-25 09:06:45 +08:00
|
|
|
case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
|
|
|
|
case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
|
2012-06-20 23:43:03 +08:00
|
|
|
case MVT::f64: Opcode = PPC::LFDUX; break;
|
|
|
|
case MVT::f32: Opcode = PPC::LFSUX; break;
|
|
|
|
case MVT::i32: Opcode = PPC::LWZUX; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZUX; break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
|
|
|
|
assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
|
|
|
|
"Invalid sext update load");
|
|
|
|
switch (LoadedVT.getSimpleVT().SimpleTy) {
|
|
|
|
default: llvm_unreachable("Invalid PPC load type!");
|
|
|
|
case MVT::i64: Opcode = PPC::LDUX; break;
|
|
|
|
case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break;
|
|
|
|
case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opcode = PPC::LBZUX8; break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue Base = LD->getBasePtr();
|
2013-03-22 22:58:48 +08:00
|
|
|
SDValue Ops[] = { Base, Offset, Chain };
|
2016-05-21 05:43:23 +08:00
|
|
|
SDNode *MN = CurDAG->getMachineNode(
|
|
|
|
Opcode, dl, LD->getValueType(0),
|
|
|
|
PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
|
|
|
|
transferMemOperands(N, MN);
|
|
|
|
ReplaceNode(N, MN);
|
|
|
|
return;
|
2006-11-10 10:08:47 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-08-18 15:30:46 +08:00
|
|
|
case ISD::AND: {
|
2006-09-22 13:01:56 +08:00
|
|
|
unsigned Imm, Imm2, SH, MB, ME;
|
2012-08-28 10:10:15 +08:00
|
|
|
uint64_t Imm64;
|
2006-09-22 13:01:56 +08:00
|
|
|
|
2005-08-18 15:30:46 +08:00
|
|
|
// If this is an and of a value rotated between 0 and 31 bits and then and'd
|
|
|
|
// with a mask, emit rlwinm
|
2006-06-27 08:04:13 +08:00
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2008-08-29 05:40:38 +08:00
|
|
|
isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Val = N->getOperand(0).getOperand(0);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
|
|
|
|
getI32Imm(ME, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return;
|
2005-08-18 15:30:46 +08:00
|
|
|
}
|
2006-09-22 13:01:56 +08:00
|
|
|
// If this is just a masked value where the input is not handled above, and
|
|
|
|
// is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
|
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2010-12-24 12:28:06 +08:00
|
|
|
isRunOfOnes(Imm, MB, ME) &&
|
2006-09-22 13:01:56 +08:00
|
|
|
N->getOperand(0).getOpcode() != ISD::ROTL) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Val = N->getOperand(0);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
|
|
|
|
getI32Imm(ME, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return;
|
2006-09-22 13:01:56 +08:00
|
|
|
}
|
2012-08-28 10:10:15 +08:00
|
|
|
// If this is a 64-bit zero-extension mask, emit rldicl.
|
|
|
|
if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
|
|
|
|
isMask_64(Imm64)) {
|
|
|
|
SDValue Val = N->getOperand(0);
|
2015-02-12 23:35:40 +08:00
|
|
|
MB = 64 - countTrailingOnes(Imm64);
|
PPC: Optimize rldicl generation for masked shifts
Masking operations (where only some number of the low bits are being kept) are
selected to rldicl(x, 0, mb). If x is a logical right shift (which would become
rldicl(y, 64-n, n)), we might be able to fold the two instructions together:
rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) for n <= mb
The right shift is really a left rotate followed by a mask, and if the explicit
mask is a more-restrictive sub-mask of the mask implied by the shift, only one
rldicl is needed.
llvm-svn: 195185
2013-11-20 09:10:15 +08:00
|
|
|
SH = 0;
|
|
|
|
|
|
|
|
// If the operand is a logical right shift, we can fold it into this
|
|
|
|
// instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
|
|
|
|
// for n <= mb. The right shift is really a left rotate followed by a
|
|
|
|
// mask, and this mask is a more-restrictive sub-mask of the mask implied
|
|
|
|
// by the shift.
|
|
|
|
if (Val.getOpcode() == ISD::SRL &&
|
|
|
|
isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
|
|
|
|
assert(Imm < 64 && "Illegal shift amount");
|
|
|
|
Val = Val.getOperand(0);
|
|
|
|
SH = 64 - Imm;
|
|
|
|
}
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
|
|
|
|
return;
|
2012-08-28 10:10:15 +08:00
|
|
|
}
|
2006-09-22 13:01:56 +08:00
|
|
|
// AND X, 0 -> 0, not "rlwinm 32".
|
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
ReplaceUses(SDValue(N, 0), N->getOperand(1));
|
2016-05-21 05:43:23 +08:00
|
|
|
return;
|
2006-09-22 13:01:56 +08:00
|
|
|
}
|
2005-12-24 09:00:15 +08:00
|
|
|
// ISD::OR doesn't get all the bitfield insertion fun.
|
2015-09-05 08:02:59 +08:00
|
|
|
// (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
|
|
|
|
// bitfield insert.
|
2010-12-24 12:28:06 +08:00
|
|
|
if (isInt32Immediate(N->getOperand(1), Imm) &&
|
2005-12-24 09:00:15 +08:00
|
|
|
N->getOperand(0).getOpcode() == ISD::OR &&
|
2006-06-27 08:04:13 +08:00
|
|
|
isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
|
2015-09-05 08:02:59 +08:00
|
|
|
// The idea here is to check whether this is equivalent to:
|
|
|
|
// (c1 & m) | (x & ~m)
|
|
|
|
// where m is a run-of-ones mask. The logic here is that, for each bit in
|
|
|
|
// c1 and c2:
|
|
|
|
// - if both are 1, then the output will be 1.
|
|
|
|
// - if both are 0, then the output will be 0.
|
|
|
|
// - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
|
|
|
|
// come from x.
|
|
|
|
// - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
|
|
|
|
// be 0.
|
|
|
|
// If that last condition is never the case, then we can form m from the
|
|
|
|
// bits that are the same between c1 and c2.
|
2006-01-06 02:32:49 +08:00
|
|
|
unsigned MB, ME;
|
2015-09-05 08:02:59 +08:00
|
|
|
if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2006-08-27 16:14:06 +08:00
|
|
|
N->getOperand(0).getOperand(1),
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(0, dl), getI32Imm(MB, dl),
|
|
|
|
getI32Imm(ME, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
|
|
|
|
return;
|
2005-12-24 09:00:15 +08:00
|
|
|
}
|
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-09-30 07:33:31 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-18 15:30:46 +08:00
|
|
|
}
|
[PowerPC] Better lowering for add/or of a FrameIndex
2014-12-12 06:51:06 +08:00
|
|
|
case ISD::OR: {
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2016-05-21 05:43:23 +08:00
|
|
|
if (tryBitfieldInsert(N))
|
|
|
|
return;
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2014-12-12 06:51:06 +08:00
|
|
|
short Imm;
|
|
|
|
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
|
|
|
|
isIntS16Immediate(N->getOperand(1), Imm)) {
|
|
|
|
APInt LHSKnownZero, LHSKnownOne;
|
|
|
|
CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
|
|
|
|
|
|
|
|
// If this is equivalent to an add, then we can fold it with the
|
|
|
|
// FrameIndex calculation.
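        // A sketch of the check below (with illustrative numbers): if
        // Imm == 4 and the frame object is known to be 8-byte aligned, then
        // the low three bits of the FrameIndex value are known zero, no bit
        // position can be set in both operands, and (or FI, 4) produces
        // exactly the same value as (add FI, 4).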
|
2016-05-21 05:43:23 +08:00
|
|
|
if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
|
|
|
|
selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
|
|
|
|
return;
|
|
|
|
}
|
2014-12-12 06:51:06 +08:00
|
|
|
}
|
|
|
|
|
2005-09-30 07:33:31 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2014-12-12 06:51:06 +08:00
|
|
|
}
|
|
|
|
case ISD::ADD: {
|
|
|
|
short Imm;
|
|
|
|
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
|
2016-05-21 05:43:23 +08:00
|
|
|
isIntS16Immediate(N->getOperand(1), Imm)) {
|
|
|
|
selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
|
|
|
|
return;
|
|
|
|
}
|
2014-12-12 06:51:06 +08:00
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
2005-08-19 07:38:00 +08:00
|
|
|
case ISD::SHL: {
|
|
|
|
unsigned Imm, SH, MB, ME;
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
|
2005-10-20 02:42:01 +08:00
|
|
|
isRotateAndMask(N, Imm, true, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(SH, dl), getI32Imm(MB, dl),
|
|
|
|
getI32Imm(ME, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return;
|
Woo, it kinda works. We now generate this atrociously bad, but correct,
code for long long foo(long long a, long long b) { return a + b; }
_foo:
or r2, r3, r3
or r3, r4, r4
or r4, r5, r5
or r5, r6, r6
rldicr r2, r2, 32, 31
rldicl r3, r3, 0, 32
rldicr r4, r4, 32, 31
rldicl r5, r5, 0, 32
or r2, r3, r2
or r3, r5, r4
add r4, r3, r2
rldicl r2, r4, 32, 32
or r4, r4, r4
or r3, r2, r2
blr
llvm-svn: 23809
2005-10-19 09:12:32 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-20 02:42:01 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-19 07:38:00 +08:00
|
|
|
}
|
|
|
|
case ISD::SRL: {
|
|
|
|
unsigned Imm, SH, MB, ME;
|
2008-08-29 05:40:38 +08:00
|
|
|
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
|
2010-12-24 12:28:06 +08:00
|
|
|
isRotateAndMask(N, Imm, true, SH, MB, ME)) {
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(0).getOperand(0),
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(SH, dl), getI32Imm(MB, dl),
|
|
|
|
getI32Imm(ME, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
|
|
|
|
return;
|
2005-10-19 09:12:32 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2005-10-20 02:42:01 +08:00
|
|
|
// Other cases are autogenerated.
|
|
|
|
break;
|
2005-08-19 07:38:00 +08:00
|
|
|
}
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// FIXME: Remove this once the ANDI glue bug is fixed:
|
|
|
|
case PPCISD::ANDIo_1_EQ_BIT:
|
|
|
|
case PPCISD::ANDIo_1_GT_BIT: {
|
|
|
|
if (!ANDIGlueBug)
|
|
|
|
break;
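    // What follows is, in effect: emit the record-form andi. (ANDIo/ANDIo8),
    // which both computes the AND and sets CR0, then extract the requested
    // CR0 bit (eq or gt) as an i1. The glue value passed to SelectNodeTo
    // keeps the andi. and the EXTRACT_SUBREG adjacent so that CR0 is not
    // clobbered in between.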
|
|
|
|
|
|
|
|
EVT InVT = N->getOperand(0).getValueType();
|
|
|
|
assert((InVT == MVT::i64 || InVT == MVT::i32) &&
|
|
|
|
"Invalid input type for ANDIo_1_EQ_BIT");
|
|
|
|
|
|
|
|
unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
|
|
|
|
SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
|
|
|
|
N->getOperand(0),
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(1, dl, InVT)),
|
|
|
|
0);
|
2014-02-28 08:27:01 +08:00
|
|
|
SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
|
|
|
|
SDValue SRIdxVal =
|
|
|
|
CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
|
2015-04-28 22:05:47 +08:00
|
|
|
PPC::sub_eq : PPC::sub_gt, dl, MVT::i32);
|
2014-02-28 08:27:01 +08:00
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg,
|
|
|
|
SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */);
|
|
|
|
return;
|
2014-02-28 08:27:01 +08:00
|
|
|
}
|
2005-08-27 02:46:49 +08:00
|
|
|
case ISD::SELECT_CC: {
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT PtrVT =
|
|
|
|
CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
|
2011-06-20 23:28:39 +08:00
|
|
|
bool isPPC64 = (PtrVT == MVT::i64);
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2014-02-28 08:27:01 +08:00
|
|
|
// If this is a select of i1 operands, we'll pattern match it.
|
2014-05-22 09:07:24 +08:00
|
|
|
if (PPCSubTarget->useCRBits() &&
|
2014-02-28 08:27:01 +08:00
|
|
|
N->getOperand(0).getValueType() == MVT::i1)
|
|
|
|
break;
|
|
|
|
|
2006-06-27 08:04:13 +08:00
|
|
|
// Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc
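    // For the SETNE case below, the selected sequence relies on the usual PPC
    // carry idiom (a sketch, not an exhaustive argument): "addic tmp, x, -1"
    // sets CA exactly when x != 0, and "subfe d, tmp, x" computes
    // ~tmp + x + CA, which reduces to CA, so d == (x != 0).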
|
2011-06-20 23:28:39 +08:00
|
|
|
if (!isPPC64)
|
|
|
|
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
|
|
|
|
if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
|
|
|
|
if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
|
|
|
|
if (N1C->isNullValue() && N3C->isNullValue() &&
|
|
|
|
N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
|
|
|
|
// FIXME: Implement this optzn for PPC64.
|
|
|
|
N->getValueType(0) == MVT::i32) {
|
|
|
|
SDNode *Tmp =
|
|
|
|
CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
|
2015-04-28 22:05:47 +08:00
|
|
|
N->getOperand(0), getI32Imm(~0U, dl));
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(Tmp, 0),
|
|
|
|
N->getOperand(0), SDValue(Tmp, 1));
|
|
|
|
return;
|
2011-06-20 23:28:39 +08:00
|
|
|
}
|
2005-08-27 05:23:58 +08:00
|
|
|
|
2009-02-07 03:16:40 +08:00
|
|
|
SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
|
2014-02-28 08:27:01 +08:00
|
|
|
|
|
|
|
if (N->getValueType(0) == MVT::i1) {
|
|
|
|
// An i1 select is: (c & t) | (!c & f).
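      // The sequence below builds exactly that: CRNOR(c, c) == !(c | c) == !c
      // gives the inverted condition bit, the two CRANDs form (c & t) and
      // (!c & f), and the final CROR combines them.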
|
|
|
|
bool Inv;
|
|
|
|
unsigned Idx = getCRIdxForSetCC(CC, Inv);
|
|
|
|
|
|
|
|
unsigned SRI;
|
|
|
|
switch (Idx) {
|
|
|
|
default: llvm_unreachable("Invalid CC index");
|
|
|
|
case 0: SRI = PPC::sub_lt; break;
|
|
|
|
case 1: SRI = PPC::sub_gt; break;
|
|
|
|
case 2: SRI = PPC::sub_eq; break;
|
|
|
|
case 3: SRI = PPC::sub_un; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
|
|
|
|
|
|
|
|
SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
|
|
|
|
CCBit, CCBit), 0);
|
|
|
|
SDValue C = Inv ? NotCCBit : CCBit,
|
|
|
|
NotC = Inv ? CCBit : NotCCBit;
|
|
|
|
|
|
|
|
SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
|
|
|
|
C, N->getOperand(2)), 0);
|
|
|
|
SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
|
|
|
|
NotC, N->getOperand(3)), 0);
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
|
|
|
|
return;
|
2014-02-28 08:27:01 +08:00
|
|
|
}
|
|
|
|
|
2006-11-18 06:10:59 +08:00
|
|
|
unsigned BROpc = getPredicateForSetCC(CC);
|
2005-08-27 05:23:58 +08:00
|
|
|
|
2005-10-01 09:35:02 +08:00
|
|
|
unsigned SelectCCOp;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2006-06-27 08:04:13 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_I4;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::i64)
|
2006-06-27 08:04:13 +08:00
|
|
|
SelectCCOp = PPC::SELECT_CC_I8;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::f32)
|
2015-05-08 02:24:05 +08:00
|
|
|
if (PPCSubTarget->hasP8Vector())
|
|
|
|
SelectCCOp = PPC::SELECT_CC_VSSRC;
|
|
|
|
else
|
|
|
|
SelectCCOp = PPC::SELECT_CC_F4;
|
2009-08-12 04:47:22 +08:00
|
|
|
else if (N->getValueType(0) == MVT::f64)
|
2014-10-23 00:58:20 +08:00
|
|
|
if (PPCSubTarget->hasVSX())
|
|
|
|
SelectCCOp = PPC::SELECT_CC_VSFRC;
|
|
|
|
else
|
|
|
|
SelectCCOp = PPC::SELECT_CC_F8;
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirror those supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
|
|
|
|
SelectCCOp = PPC::SELECT_CC_QFRC;
|
|
|
|
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
|
|
|
|
SelectCCOp = PPC::SELECT_CC_QSRC;
|
|
|
|
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
|
|
|
|
SelectCCOp = PPC::SELECT_CC_QBRC;
|
2014-10-22 21:13:40 +08:00
|
|
|
else if (N->getValueType(0) == MVT::v2f64 ||
|
|
|
|
N->getValueType(0) == MVT::v2i64)
|
|
|
|
SelectCCOp = PPC::SELECT_CC_VSRC;
|
2006-04-09 06:45:08 +08:00
|
|
|
else
|
|
|
|
SelectCCOp = PPC::SELECT_CC_VRRC;
|
|
|
|
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
|
2015-04-28 22:05:47 +08:00
|
|
|
getI32Imm(BROpc, dl) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
|
|
|
|
return;
|
2005-08-27 02:46:49 +08:00
|
|
|
}
|
2014-03-26 20:49:28 +08:00
|
|
|
case ISD::VSELECT:
|
2014-05-22 09:07:24 +08:00
|
|
|
if (PPCSubTarget->hasVSX()) {
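      // xxsel writes (A & ~C) | (B & C), taking B where the mask C is set, so
      // the DAG's (mask, true, false) operands are reordered below to
      // (false, true, mask).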
|
2014-03-26 20:49:28 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
|
|
|
|
return;
|
2014-03-26 20:49:28 +08:00
|
|
|
}
|
|
|
|
|
2014-03-27 06:58:37 +08:00
|
|
|
break;
|
|
|
|
case ISD::VECTOR_SHUFFLE:
|
2014-05-22 09:07:24 +08:00
|
|
|
if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
|
2014-03-27 06:58:37 +08:00
|
|
|
N->getValueType(0) == MVT::v2i64)) {
|
|
|
|
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
|
2015-12-03 02:53:33 +08:00
|
|
|
|
2014-03-27 06:58:37 +08:00
|
|
|
SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
|
|
|
|
Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
|
|
|
|
unsigned DM[2];
|
|
|
|
|
|
|
|
for (int i = 0; i < 2; ++i)
|
|
|
|
if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2)
|
|
|
|
DM[i] = 0;
|
|
|
|
else
|
|
|
|
DM[i] = 1;
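      // If both shuffle inputs are the same SCALAR_TO_VECTOR of a single load
      // and the mask selects doubleword 0 of each input (DM == {0, 0}), this
      // is a splat of a loaded scalar; when an indexed-only address can be
      // formed, select it directly as a load-and-splat (LXVDSX) below.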
|
|
|
|
|
|
|
|
if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
|
|
|
|
Op1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
|
|
|
|
isa<LoadSDNode>(Op1.getOperand(0))) {
|
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
|
|
|
|
SDValue Base, Offset;
|
|
|
|
|
2015-11-02 22:01:11 +08:00
|
|
|
if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() &&
|
2015-10-15 04:45:00 +08:00
|
|
|
(LD->getMemoryVT() == MVT::f64 ||
|
|
|
|
LD->getMemoryVT() == MVT::i64) &&
|
2014-03-27 06:58:37 +08:00
|
|
|
SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
|
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue Ops[] = { Base, Offset, Chain };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
|
|
|
|
return;
|
2014-03-27 06:58:37 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-02 03:40:07 +08:00
|
|
|
// For little endian, we must swap the input operands and adjust
|
|
|
|
// the mask elements (reverse and invert them).
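      // For example (illustrative): a big-endian mask taking doubleword 0 of
      // each input (DM == {0, 0}) becomes, after the swap of Op1 and Op2,
      // DM == {1, 1} on the swapped operands, which names the same two
      // doublewords in the little-endian register layout.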
|
|
|
|
if (PPCSubTarget->isLittleEndian()) {
|
|
|
|
std::swap(Op1, Op2);
|
|
|
|
unsigned tmp = DM[0];
|
|
|
|
DM[0] = 1 - DM[1];
|
|
|
|
DM[1] = 1 - tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl,
|
|
|
|
MVT::i32);
|
2014-03-27 06:58:37 +08:00
|
|
|
SDValue Ops[] = { Op1, Op2, DMV };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
|
|
|
|
return;
|
2014-03-27 06:58:37 +08:00
|
|
|
}
|
|
|
|
|
2014-03-26 20:49:28 +08:00
|
|
|
break;
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
case PPCISD::BDNZ:
|
|
|
|
case PPCISD::BDZ: {
|
2014-05-22 09:07:24 +08:00
|
|
|
bool IsPPC64 = PPCSubTarget->isPPC64();
|
2013-05-16 05:37:41 +08:00
|
|
|
SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ
|
|
|
|
? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
|
|
|
|
: (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
|
|
|
|
MVT::Other, Ops);
|
|
|
|
return;
|
2013-05-16 05:37:41 +08:00
|
|
|
}
|
2006-11-18 06:37:34 +08:00
|
|
|
case PPCISD::COND_BRANCH: {
|
2008-11-06 01:16:24 +08:00
|
|
|
// Op #0 is the Chain.
|
2006-11-18 06:37:34 +08:00
|
|
|
// Op #1 is the PPC::PRED_* number.
|
|
|
|
// Op #2 is the CR#
|
|
|
|
// Op #3 is the Dest MBB
|
Eliminate the ISel priority queue, which used the topological order for a
priority function. Instead, just iterate over the AllNodes list, which is
already in topological order. This eliminates a fair amount of bookkeeping,
and speeds up the isel phase by about 15% on many testcases.
The impact on most targets is that AddToISelQueue calls can be simply removed.
In the x86 target, there are two additional notable changes.
The rule-bending AND+SHIFT optimization in MatchAddress that creates new
pre-isel nodes during isel is now a little more verbose, but more robust.
Instead of either creating an invalid DAG or creating an invalid topological
sort, as it has historically done, it can now just insert the new nodes into
the node list at a position where they will be consistent with the topological
ordering.
Also, the address-matching code has logic that checked to see if a node was
"already selected". However, when a node is selected, it has all its uses
taken away via ReplaceAllUsesWith or equivalent, so it won't receive any
further visits from MatchAddress. This code is now removed.
llvm-svn: 58748
2008-11-05 12:14:16 +08:00
|
|
|
// Op #4 is the Flag.
|
2007-06-29 09:25:06 +08:00
|
|
|
// Prevent PPC::PRED_* from being selected into LI.
|
2015-12-12 08:32:00 +08:00
|
|
|
unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
|
|
|
if (EnableBranchHint)
|
|
|
|
PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3));
|
|
|
|
|
|
|
|
SDValue Pred = getI32Imm(PCC, dl);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
|
2006-11-18 06:37:34 +08:00
|
|
|
N->getOperand(0), N->getOperand(4) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
|
|
|
|
return;
|
2006-11-18 06:37:34 +08:00
|
|
|
}
|
2006-03-17 09:40:33 +08:00
|
|
|
case ISD::BR_CC: {
|
2005-08-22 02:50:37 +08:00
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
|
2014-02-28 08:27:01 +08:00
|
|
|
unsigned PCC = getPredicateForSetCC(CC);
|
|
|
|
|
|
|
|
if (N->getOperand(2).getValueType() == MVT::i1) {
|
|
|
|
unsigned Opc;
|
|
|
|
bool Swap;
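      // A sketch of how one row of this table is derived, reading the i1
      // operands as unsigned booleans: "a < b" holds exactly when a == 0 and
      // b == 1, i.e. b & ~a, which is CRANDC applied to the swapped operands;
      // hence PRED_LT maps to CRANDC with Swap = true.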
|
|
|
|
switch (PCC) {
|
|
|
|
default: llvm_unreachable("Unexpected Boolean-operand predicate");
|
|
|
|
case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break;
|
|
|
|
case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break;
|
|
|
|
case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break;
|
|
|
|
case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break;
|
|
|
|
case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
|
|
|
|
case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
|
|
|
|
N->getOperand(Swap ? 3 : 2),
|
|
|
|
N->getOperand(Swap ? 2 : 3)), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, BitComp, N->getOperand(4),
|
|
|
|
N->getOperand(0));
|
|
|
|
return;
|
2014-02-28 08:27:01 +08:00
|
|
|
}
|
|
|
|
|
2015-12-12 08:32:00 +08:00
|
|
|
if (EnableBranchHint)
|
|
|
|
PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4));
|
|
|
|
|
2009-02-07 03:16:40 +08:00
|
|
|
SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Ops[] = { getI32Imm(PCC, dl), CondCode,
|
2006-08-27 16:14:06 +08:00
|
|
|
N->getOperand(4), N->getOperand(0) };
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
|
|
|
|
return;
|
2005-08-22 02:50:37 +08:00
|
|
|
}
|
2006-04-23 02:53:45 +08:00
|
|
|
case ISD::BRIND: {
|
2006-06-10 09:15:02 +08:00
|
|
|
// FIXME: Should custom lower this.
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Chain = N->getOperand(0);
|
|
|
|
SDValue Target = N->getOperand(1);
|
2009-08-12 04:47:22 +08:00
|
|
|
unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
|
2011-06-03 23:47:49 +08:00
|
|
|
unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
|
2011-12-08 12:36:44 +08:00
|
|
|
Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
|
2009-09-26 02:54:59 +08:00
|
|
|
Chain), 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
|
|
|
|
return;
|
2006-04-23 02:53:45 +08:00
|
|
|
}
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
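As an aside (a minimal sketch, not part of this patch; the helper names ha and lo
are hypothetical), the split performed by these relocations can be reproduced with
plain integer arithmetic, which also shows why the high half must be adjusted by
0x8000 to compensate for the sign-extended low 16 bits:
#include <cassert>
#include <cstdint>

// Low 16 bits, used as a sign-extended 16-bit displacement (assumes the usual
// wrap-around conversion and arithmetic right shift on the host).
static int16_t lo(int32_t Offset) { return static_cast<int16_t>(Offset); }
// High-adjusted 16 bits: add 0x8000 so that adding the sign-extended low part
// back in recovers the original offset.
static int16_t ha(int32_t Offset) {
  return static_cast<int16_t>((Offset + 0x8000) >> 16);
}

int main() {
  int32_t Offset = 0x00012A48; // hypothetical 32-bit TOC offset of ei
  // addis 3, 2, ha(Offset); ld 3, lo(Offset)(3) recombines the offset:
  int32_t Recombined = static_cast<int32_t>(ha(Offset)) * 0x10000 + lo(Offset);
  assert(Recombined == Offset);
  return 0;
}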
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility in passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
case PPCISD::TOC_ENTRY: {
|
2014-11-12 23:16:30 +08:00
|
|
|
assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
|
|
|
|
"Only supported for 64-bit ABI and 32-bit SVR4");
|
2014-07-19 07:29:49 +08:00
|
|
|
if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
|
|
|
|
SDValue GA = N->getOperand(0);
|
2016-05-21 05:43:23 +08:00
|
|
|
SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
|
|
|
|
N->getOperand(1));
|
|
|
|
transferMemOperands(N, MN);
|
|
|
|
ReplaceNode(N, MN);
|
|
|
|
return;
|
2014-08-28 12:40:55 +08:00
|
|
|
}
|
2012-11-28 01:35:46 +08:00
|
|
|
|
2013-02-22 01:12:27 +08:00
|
|
|
// For medium and large code model, we generate two instructions as
|
|
|
|
// described below. Otherwise we allow SelectCodeCommon to handle this,
|
2014-10-31 18:33:14 +08:00
|
|
|
// selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA.
|
2013-02-22 01:12:27 +08:00
|
|
|
CodeModel::Model CModel = TM.getCodeModel();
|
|
|
|
if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
|
2012-11-28 01:35:46 +08:00
|
|
|
break;
|
|
|
|
|
2014-06-17 05:36:02 +08:00
|
|
|
// The first source operand is a TargetGlobalAddress or a TargetJumpTable.
|
2015-11-21 04:51:31 +08:00
|
|
|
// If it must be toc-referenced according to PPCSubTarget, we generate:
|
2012-11-28 01:35:46 +08:00
|
|
|
// LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
|
|
|
|
// Otherwise we generate:
|
|
|
|
// ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
|
|
|
|
SDValue GA = N->getOperand(0);
|
|
|
|
SDValue TOCbase = N->getOperand(1);
|
|
|
|
SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
|
[PowerPC] Make LDtocL and friends invariant loads
LDtocL, and other loads that roughly correspond to the TOC_ENTRY SDAG node,
represent loads from the TOC, which is invariant. As a result, these loads can
be hoisted out of loops, etc. In order to do this, we need to generate
GOT-style MMOs for TOC_ENTRY, which requires treating it as a legitimate memory
intrinsic node type. Once this is done, the MMO transfer is automatically
handled for TableGen-driven instruction selection, and for nodes generated
directly in PPCISelDAGToDAG, we need to transfer the MMOs manually.
Also, we were not transferring MMOs associated with pre-increment loads, so do
that too.
Lastly, this fixes an exposed bug where R30 was not added as a defined operand of
UpdateGBR.
This problem was highlighted by an example (used to generate the test case)
posted to llvmdev by Francois Pichet.
llvm-svn: 230553
2015-02-26 05:36:59 +08:00
|
|
|
TOCbase, GA);
|
2012-11-28 01:35:46 +08:00
|
|
|
|
2014-10-31 18:33:14 +08:00
|
|
|
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
|
2016-05-21 05:43:23 +08:00
|
|
|
CModel == CodeModel::Large) {
|
|
|
|
SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
|
|
|
|
SDValue(Tmp, 0));
|
|
|
|
transferMemOperands(N, MN);
|
|
|
|
ReplaceNode(N, MN);
|
|
|
|
return;
|
|
|
|
}
|
2012-11-28 01:35:46 +08:00
|
|
|
|
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
|
2015-11-21 04:51:31 +08:00
|
|
|
const GlobalValue *GV = G->getGlobal();
|
|
|
|
unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
|
|
|
|
if (GVFlags & PPCII::MO_NLP_FLAG) {
|
2016-05-21 05:43:23 +08:00
|
|
|
SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
|
|
|
|
SDValue(Tmp, 0));
|
|
|
|
transferMemOperands(N, MN);
|
|
|
|
ReplaceNode(N, MN);
|
|
|
|
return;
|
2015-11-21 04:51:31 +08:00
|
|
|
}
|
2012-11-28 01:35:46 +08:00
|
|
|
}
|
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
|
|
|
|
SDValue(Tmp, 0), GA));
|
|
|
|
return;
|
2012-11-28 01:35:46 +08:00
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
case PPCISD::PPC32_PICGOT: {
|
|
|
|
// Generate a PIC-safe GOT reference.
|
|
|
|
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
|
|
|
|
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
|
2016-05-21 05:43:23 +08:00
|
|
|
CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
|
|
|
|
PPCLowering->getPointerTy(CurDAG->getDataLayout()),
|
|
|
|
MVT::i32);
|
|
|
|
return;
|
2014-07-26 01:47:22 +08:00
|
|
|
}
|
2013-02-20 23:50:31 +08:00
|
|
|
case PPCISD::VADD_SPLAT: {
|
2013-02-21 04:41:42 +08:00
|
|
|
// This expands into one of three sequences, depending on whether
|
|
|
|
// the first operand is odd or even, positive or negative.
|
2013-02-20 23:50:31 +08:00
|
|
|
assert(isa<ConstantSDNode>(N->getOperand(0)) &&
|
|
|
|
isa<ConstantSDNode>(N->getOperand(1)) &&
|
|
|
|
"Invalid operand on VADD_SPLAT!");
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
int Elt = N->getConstantOperandVal(0);
|
2013-02-20 23:50:31 +08:00
|
|
|
int EltSize = N->getConstantOperandVal(1);
|
2013-02-21 04:41:42 +08:00
|
|
|
unsigned Opc1, Opc2, Opc3;
|
2013-02-20 23:50:31 +08:00
|
|
|
EVT VT;
|
2013-02-21 04:41:42 +08:00
|
|
|
|
2013-02-20 23:50:31 +08:00
|
|
|
if (EltSize == 1) {
|
|
|
|
Opc1 = PPC::VSPLTISB;
|
|
|
|
Opc2 = PPC::VADDUBM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUBM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v16i8;
|
|
|
|
} else if (EltSize == 2) {
|
|
|
|
Opc1 = PPC::VSPLTISH;
|
|
|
|
Opc2 = PPC::VADDUHM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUHM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v8i16;
|
|
|
|
} else {
|
|
|
|
assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
|
|
|
|
Opc1 = PPC::VSPLTISW;
|
|
|
|
Opc2 = PPC::VADDUWM;
|
2013-02-21 04:41:42 +08:00
|
|
|
Opc3 = PPC::VSUBUWM;
|
2013-02-20 23:50:31 +08:00
|
|
|
VT = MVT::v4i32;
|
|
|
|
}
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
if ((Elt & 1) == 0) {
|
|
|
|
// Elt is even, in the range [-32,-18] + [16,30].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp = VSPLTIS[BHW] elt/2
|
|
|
|
// VADDU[BHW]M tmp, tmp
|
|
|
|
// Where: [BHW] = B for size = 1, H for size = 2, W for size = 4
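// For example (illustrative values): elt = 28, size = 4 becomes
// VSPLTISW tmp, 14 followed by VADDUWM tmp, tmp, so every v4i32 lane
// ends up holding 14 + 14 = 28.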
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue EltVal = getI32Imm(Elt >> 1, dl);
|
2013-02-21 04:41:42 +08:00
|
|
|
SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
|
|
|
SDValue TmpVal = SDValue(Tmp, 0);
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
|
|
|
|
return;
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
} else if (Elt > 0) {
|
|
|
|
// Elt is odd and positive, in the range [17,31].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp1 = VSPLTIS[BHW] elt-16
|
|
|
|
// tmp2 = VSPLTIS[BHW] -16
|
|
|
|
// VSUBU[BHW]M tmp1, tmp2
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue EltVal = getI32Imm(Elt - 16, dl);
|
2013-02-21 04:41:42 +08:00
|
|
|
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
2015-04-28 22:05:47 +08:00
|
|
|
EltVal = getI32Imm(-16, dl);
|
2013-02-21 04:41:42 +08:00
|
|
|
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
|
|
|
|
SDValue(Tmp2, 0)));
|
|
|
|
return;
|
2013-02-21 04:41:42 +08:00
|
|
|
|
|
|
|
} else {
|
|
|
|
// Elt is odd and negative, in the range [-31,-17].
|
|
|
|
//
|
|
|
|
// Convert: VADD_SPLAT elt, size
|
|
|
|
// Into: tmp1 = VSPLTIS[BHW] elt+16
|
|
|
|
// tmp2 = VSPLTIS[BHW] -16
|
|
|
|
// VADDU[BHW]M tmp1, tmp2
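// Worked example (illustrative): VADD_SPLAT -17, 1 becomes
//   tmp1 = VSPLTISB -1
//   tmp2 = VSPLTISB -16
//   VADDUBM tmp1, tmp2
// since -1 + (-16) = -17 in every byte.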
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue EltVal = getI32Imm(Elt + 16, dl);
|
2013-02-21 04:41:42 +08:00
|
|
|
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
2015-04-28 22:05:47 +08:00
|
|
|
EltVal = getI32Imm(-16, dl);
|
2013-02-21 04:41:42 +08:00
|
|
|
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
|
2016-05-21 05:43:23 +08:00
|
|
|
ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
|
|
|
|
SDValue(Tmp2, 0)));
|
|
|
|
return;
|
2013-02-21 04:41:42 +08:00
|
|
|
}
|
2013-02-20 23:50:31 +08:00
|
|
|
}
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
2010-12-24 12:28:06 +08:00
|
|
|
|
2016-05-21 05:43:23 +08:00
|
|
|
SelectCode(N);
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|
|
|
|
|
2015-01-03 09:16:37 +08:00
|
|
|
// If the target supports the cmpb instruction, do the idiom recognition here.
|
|
|
|
// We don't do this as a DAG combine because we don't want to do it as nodes
|
|
|
|
// are being combined (because we might miss part of the eventual idiom). We
|
|
|
|
// don't want to do it during instruction selection because we want to reuse
|
|
|
|
// the logic for lowering the masking operations already part of the
|
|
|
|
// instruction selector.
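// (The cmpb instruction compares its two sources byte by byte and writes 0xFF
// into each result byte whose inputs are equal and 0x00 otherwise; the idiom
// matched below is an OR of per-byte SELECT_CC nodes, each of which computes
// exactly one such result byte.)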
|
|
|
|
SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
|
|
|
|
SDLoc dl(N);
|
|
|
|
|
|
|
|
assert(N->getOpcode() == ISD::OR &&
|
|
|
|
"Only OR nodes are supported for CMPB");
|
|
|
|
|
|
|
|
SDValue Res;
|
|
|
|
if (!PPCSubTarget->hasCMPB())
|
|
|
|
return Res;
|
|
|
|
|
|
|
|
if (N->getValueType(0) != MVT::i32 &&
|
|
|
|
N->getValueType(0) != MVT::i64)
|
|
|
|
return Res;
|
|
|
|
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
|
|
|
|
SDValue RHS, LHS;
|
|
|
|
bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
|
|
uint64_t Mask = 0, Alt = 0;
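// BytesFound records which of the (up to eight) result bytes have been matched
// so far; Mask accumulates the value each matched byte takes when the inputs
// compare equal, and Alt the value it takes when they differ.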
|
|
|
|
|
|
|
|
auto IsByteSelectCC = [this](SDValue O, unsigned &b,
|
|
|
|
uint64_t &Mask, uint64_t &Alt,
|
|
|
|
SDValue &LHS, SDValue &RHS) {
|
|
|
|
if (O.getOpcode() != ISD::SELECT_CC)
|
|
|
|
return false;
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get();
|
|
|
|
|
|
|
|
if (!isa<ConstantSDNode>(O.getOperand(2)) ||
|
|
|
|
!isa<ConstantSDNode>(O.getOperand(3)))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
uint64_t PM = O.getConstantOperandVal(2);
|
|
|
|
uint64_t PAlt = O.getConstantOperandVal(3);
|
|
|
|
for (b = 0; b < 8; ++b) {
|
|
|
|
uint64_t Mask = UINT64_C(0xFF) << (8*b);
|
|
|
|
if (PM && (PM & Mask) == PM && (PAlt & Mask) == PAlt)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (b == 8)
|
|
|
|
return false;
|
|
|
|
Mask |= PM;
|
|
|
|
Alt |= PAlt;
|
|
|
|
|
|
|
|
if (!isa<ConstantSDNode>(O.getOperand(1)) ||
|
|
|
|
O.getConstantOperandVal(1) != 0) {
|
|
|
|
SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1);
|
|
|
|
if (Op0.getOpcode() == ISD::TRUNCATE)
|
|
|
|
Op0 = Op0.getOperand(0);
|
|
|
|
if (Op1.getOpcode() == ISD::TRUNCATE)
|
|
|
|
Op1 = Op1.getOperand(0);
|
|
|
|
|
|
|
|
if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL &&
|
|
|
|
Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ &&
|
|
|
|
isa<ConstantSDNode>(Op0.getOperand(1))) {
|
|
|
|
|
|
|
|
unsigned Bits = Op0.getValueType().getSizeInBits();
|
|
|
|
if (b != Bits/8-1)
|
|
|
|
return false;
|
|
|
|
if (Op0.getConstantOperandVal(1) != Bits-8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LHS = Op0.getOperand(0);
|
|
|
|
RHS = Op1.getOperand(0);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// When we have small integers (i16 to be specific), the form present
|
|
|
|
// post-legalization uses SETULT in the SELECT_CC for the
|
|
|
|
// higher-order byte, depending on the fact that the
|
|
|
|
// even-higher-order bytes are known to all be zero, for example:
|
|
|
|
// select_cc (xor $lhs, $rhs), 256, 65280, 0, setult
|
|
|
|
// (so when the second byte is the same, because all higher-order
|
|
|
|
// bits from bytes 3 and 4 are known to be zero, the result of the
|
|
|
|
// xor can be at most 255)
|
|
|
|
if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT &&
|
|
|
|
isa<ConstantSDNode>(O.getOperand(1))) {
|
|
|
|
|
|
|
|
uint64_t ULim = O.getConstantOperandVal(1);
|
|
|
|
if (ULim != (UINT64_C(1) << b*8))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Now we need to make sure that the upper bytes are known to be
|
|
|
|
// zero.
|
|
|
|
unsigned Bits = Op0.getValueType().getSizeInBits();
|
|
|
|
if (!CurDAG->MaskedValueIsZero(Op0,
|
|
|
|
APInt::getHighBitsSet(Bits, Bits - (b+1)*8)))
|
|
|
|
return false;
|
2015-12-03 02:53:33 +08:00
|
|
|
|
2015-01-03 09:16:37 +08:00
|
|
|
LHS = Op0.getOperand(0);
|
|
|
|
RHS = Op0.getOperand(1);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CC != ISD::SETEQ)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDValue Op = O.getOperand(0);
|
|
|
|
if (Op.getOpcode() == ISD::AND) {
|
|
|
|
if (!isa<ConstantSDNode>(Op.getOperand(1)))
|
|
|
|
return false;
|
|
|
|
if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b)))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDValue XOR = Op.getOperand(0);
|
|
|
|
if (XOR.getOpcode() == ISD::TRUNCATE)
|
|
|
|
XOR = XOR.getOperand(0);
|
|
|
|
if (XOR.getOpcode() != ISD::XOR)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LHS = XOR.getOperand(0);
|
|
|
|
RHS = XOR.getOperand(1);
|
|
|
|
return true;
|
|
|
|
} else if (Op.getOpcode() == ISD::SRL) {
|
|
|
|
if (!isa<ConstantSDNode>(Op.getOperand(1)))
|
|
|
|
return false;
|
|
|
|
unsigned Bits = Op.getValueType().getSizeInBits();
|
|
|
|
if (b != Bits/8-1)
|
|
|
|
return false;
|
|
|
|
if (Op.getConstantOperandVal(1) != Bits-8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDValue XOR = Op.getOperand(0);
|
|
|
|
if (XOR.getOpcode() == ISD::TRUNCATE)
|
|
|
|
XOR = XOR.getOperand(0);
|
|
|
|
if (XOR.getOpcode() != ISD::XOR)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LHS = XOR.getOperand(0);
|
|
|
|
RHS = XOR.getOperand(1);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
};
|
|
|
|
|
|
|
|
SmallVector<SDValue, 8> Queue(1, SDValue(N, 0));
|
|
|
|
while (!Queue.empty()) {
|
|
|
|
SDValue V = Queue.pop_back_val();
|
|
|
|
|
|
|
|
for (const SDValue &O : V.getNode()->ops()) {
|
|
|
|
unsigned b;
|
|
|
|
uint64_t M = 0, A = 0;
|
|
|
|
SDValue OLHS, ORHS;
|
|
|
|
if (O.getOpcode() == ISD::OR) {
|
|
|
|
Queue.push_back(O);
|
|
|
|
} else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) {
|
|
|
|
if (!LHS) {
|
|
|
|
LHS = OLHS;
|
|
|
|
RHS = ORHS;
|
|
|
|
BytesFound[b] = true;
|
|
|
|
Mask |= M;
|
|
|
|
Alt |= A;
|
|
|
|
} else if ((LHS == ORHS && RHS == OLHS) ||
|
|
|
|
(RHS == ORHS && LHS == OLHS)) {
|
|
|
|
BytesFound[b] = true;
|
|
|
|
Mask |= M;
|
|
|
|
Alt |= A;
|
|
|
|
} else {
|
|
|
|
return Res;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return Res;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned LastB = 0, BCnt = 0;
|
|
|
|
for (unsigned i = 0; i < 8; ++i)
|
|
|
|
if (BytesFound[LastB]) {
|
|
|
|
++BCnt;
|
|
|
|
LastB = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!LastB || BCnt < 2)
|
|
|
|
return Res;
|
|
|
|
|
|
|
|
// Because we'll be zero-extending the output anyway if we don't have a specific
|
|
|
|
// value for each input byte (via the Mask), we can 'anyext' the inputs.
|
|
|
|
if (LHS.getValueType() != VT) {
|
|
|
|
LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT);
|
|
|
|
RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT);
|
|
|
|
}
|
|
|
|
|
|
|
|
Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS);
|
|
|
|
|
|
|
|
bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1);
|
|
|
|
if (NonTrivialMask && !Alt) {
|
|
|
|
// Res = Mask & CMPB
|
2015-04-28 22:05:47 +08:00
|
|
|
Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
|
|
|
|
CurDAG->getConstant(Mask, dl, VT));
|
2015-01-03 09:16:37 +08:00
|
|
|
} else if (Alt) {
|
|
|
|
// Res = (CMPB & Mask) | (~CMPB & Alt)
|
|
|
|
// Which, as suggested here:
|
|
|
|
// https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
|
|
|
|
// can be written as:
|
|
|
|
// Res = Alt ^ ((Alt ^ Mask) & CMPB)
|
|
|
|
// useful because the (Alt ^ Mask) can be pre-computed.
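// (Check: where a CMPB byte is 0xFF this gives Alt ^ (Alt ^ Mask) = Mask for
// that byte, and where it is 0x00 it gives Alt, as required.)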
|
|
|
|
Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getConstant(Mask ^ Alt, dl, VT));
|
|
|
|
Res = CurDAG->getNode(ISD::XOR, dl, VT, Res,
|
|
|
|
CurDAG->getConstant(Alt, dl, VT));
|
2015-01-03 09:16:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return Res;
|
|
|
|
}
|
|
|
|
|
[PowerPC] Fold i1 extensions with other ops
Consider this function from our README.txt file:
int foo(int a, int b) { return (a < b) << 4; }
We now explicitly track CR bits by default, so the comment in the README.txt
about not really having a SETCC is no longer accurate, but we did generate this
somewhat silly code:
cmpw 0, 3, 4
li 3, 0
li 12, 1
isel 3, 12, 3, 0
sldi 3, 3, 4
blr
which generates the zext as a select between 0 and 1, and then shifts the
result by a constant amount. Here we preprocess the DAG in order to fold the
results of operations on an extension of an i1 value into the SELECT_I[48]
pseudo instruction when the resulting constant can be materialized using one
instruction (just like the 0 and 1). This was not implemented as a DAGCombine
because the resulting code would have been anti-canonical and depends on
replacing chained user nodes, which does not fit well into the lowering
paradigm. Now we generate:
cmpw 0, 3, 4
li 3, 0
li 12, 16
isel 3, 12, 3, 0
blr
which is less silly.
llvm-svn: 225203
2015-01-06 05:10:24 +08:00
|
|
|
// When CR bit registers are enabled, an extension of an i1 variable to a i32
|
|
|
|
// or i64 value is lowered in terms of a SELECT_I[48] operation, and thus
|
|
|
|
// involves constant materialization of a 0 or a 1 or both. If the result of
|
|
|
|
// the extension is then operated upon by some operator that can be constant
|
|
|
|
// folded with a constant 0 or 1, and that constant can be materialized using
|
|
|
|
// only one instruction (like a zero or one), then we should fold in those
|
|
|
|
// operations with the select.
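// For example (illustrative): (zext i1 %c to i32) << 4 can fold to
// select %c, 16, 0, because both 16 and 0 still fit in a signed 16-bit
// immediate and so can be materialized with a single load-immediate.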
|
|
|
|
void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
|
|
|
|
if (!PPCSubTarget->useCRBits())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (N->getOpcode() != ISD::ZERO_EXTEND &&
|
|
|
|
N->getOpcode() != ISD::SIGN_EXTEND &&
|
|
|
|
N->getOpcode() != ISD::ANY_EXTEND)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (N->getOperand(0).getValueType() != MVT::i1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!N->hasOneUse())
|
|
|
|
return;
|
|
|
|
|
|
|
|
SDLoc dl(N);
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
SDValue Cond = N->getOperand(0);
|
|
|
|
SDValue ConstTrue =
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, dl, VT);
|
|
|
|
SDValue ConstFalse = CurDAG->getConstant(0, dl, VT);
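// Starting from the 0/1 (or 0/-1) pair implied by the extension, repeatedly
// constant-fold that pair through the node's sole user; each successful step
// turns the user into a select of the two folded constants and makes it the
// new candidate, for as long as it in turn has a single user.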
|
2015-01-06 05:10:24 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
SDNode *User = *N->use_begin();
|
|
|
|
if (User->getNumOperands() != 2)
|
|
|
|
break;
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
auto TryFold = [this, N, User, dl](SDValue Val) {
|
2015-01-06 05:10:24 +08:00
|
|
|
SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1);
|
|
|
|
SDValue O0 = UserO0.getNode() == N ? Val : UserO0;
|
|
|
|
SDValue O1 = UserO1.getNode() == N ? Val : UserO1;
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl,
|
2015-01-06 05:10:24 +08:00
|
|
|
User->getValueType(0),
|
|
|
|
O0.getNode(), O1.getNode());
|
|
|
|
};
|
|
|
|
|
|
|
|
SDValue TrueRes = TryFold(ConstTrue);
|
|
|
|
if (!TrueRes)
|
|
|
|
break;
|
|
|
|
SDValue FalseRes = TryFold(ConstFalse);
|
|
|
|
if (!FalseRes)
|
|
|
|
break;
|
|
|
|
|
|
|
|
// For us to materialize these using one instruction, we must be able to
|
|
|
|
// represent them as signed 16-bit integers.
|
|
|
|
uint64_t True = cast<ConstantSDNode>(TrueRes)->getZExtValue(),
|
|
|
|
False = cast<ConstantSDNode>(FalseRes)->getZExtValue();
|
|
|
|
if (!isInt<16>(True) || !isInt<16>(False))
|
|
|
|
break;
|
|
|
|
|
|
|
|
// We can replace User with a new SELECT node, and try again to see if we
|
|
|
|
// can fold the select with its user.
|
|
|
|
Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes);
|
|
|
|
N = User;
|
|
|
|
ConstTrue = TrueRes;
|
|
|
|
ConstFalse = FalseRes;
|
|
|
|
} while (N->hasOneUse());
|
|
|
|
}
|
|
|
|
|
2015-01-03 09:16:37 +08:00
|
|
|
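// PreprocessISelDAG - Run the target-specific transforms above (combineToCMPB
// and foldBoolExts) over every live node before instruction selection starts,
// replacing a node whenever one of them produces a result.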
void PPCDAGToDAGISel::PreprocessISelDAG() {
|
|
|
|
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
|
|
|
|
++Position;
|
|
|
|
|
|
|
|
bool MadeChange = false;
|
|
|
|
while (Position != CurDAG->allnodes_begin()) {
|
2015-10-20 09:07:37 +08:00
|
|
|
SDNode *N = &*--Position;
|
2015-01-03 09:16:37 +08:00
|
|
|
if (N->use_empty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SDValue Res;
|
|
|
|
switch (N->getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case ISD::OR:
|
|
|
|
Res = combineToCMPB(N);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-01-06 05:10:24 +08:00
|
|
|
if (!Res)
|
|
|
|
foldBoolExts(Res, N);
|
|
|
|
|
2015-01-03 09:16:37 +08:00
|
|
|
if (Res) {
|
|
|
|
DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ");
|
|
|
|
DEBUG(N->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\nNew: ");
|
|
|
|
DEBUG(Res.getNode()->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\n");
|
|
|
|
|
|
|
|
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
|
|
|
|
MadeChange = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (MadeChange)
|
|
|
|
CurDAG->RemoveDeadNodes();
|
|
|
|
}
|
|
|
|
|
2014-01-03 06:09:39 +08:00
|
|
|
/// PostprocessISelDAG - Perform some late peephole optimizations
|
PPCDAGToDAGISel::PostprocessISelDAG()
This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.
One optimization is implemented here: folds to clean up complex
addressing expressions for thread-local storage and medium code
model. It will also be useful for large code model sequences when
those are added later. I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point. DCE is
trivial on the DAG representation.
A typical example of a candidate code sequence in assembly:
addis 3, 2, globalvar@toc@ha
addi 3, 3, globalvar@toc@l
lwz 5, 0(3)
When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:
addis 3, 2, globalvar@toc@ha
lwz 5, globalvar@toc@l(3)
Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.
llvm-svn: 175697
2013-02-21 08:38:25 +08:00
|
|
|
/// on the DAG representation.
|
|
|
|
void PPCDAGToDAGISel::PostprocessISelDAG() {
|
|
|
|
|
|
|
|
// Skip peepholes at -O0.
|
|
|
|
if (TM.getOptLevel() == CodeGenOpt::None)
|
|
|
|
return;
|
|
|
|
|
2014-02-28 08:27:01 +08:00
|
|
|
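// Roughly: PeepholePPC64 folds add-immediate offsets (e.g. @toc@l) into the
// memory operand of a following load or store, PeepholeCROps simplifies CR-bit
// logic (see below), and PeepholePPC64ZExt is expected to remove redundant
// 64-bit zero-extensions.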
PeepholePPC64();
|
2014-05-14 08:31:15 +08:00
|
|
|
PeepholeCROps();
|
2014-12-13 07:59:36 +08:00
|
|
|
PeepholePPC64ZExt();
|
2014-02-28 08:27:01 +08:00
|
|
|
}
|
|
|
|
|
2014-02-28 14:11:16 +08:00
|
|
|
// Check if all users of this node will become isel where the second operand
|
|
|
|
// is the constant zero. If this is so, and if we can negate the condition,
|
|
|
|
// then we can flip the true and false operands. This will allow the zero to
|
|
|
|
// be folded with the isel so that we don't need to materialize a register
|
|
|
|
// containing zero.
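// (On PowerPC, isel treats a first source operand of r0 as the literal value
// zero, so once the zero sits in that position no separate "li 0" is needed.)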
|
|
|
|
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
|
|
|
|
// If we're not using isel, then this does not matter.
|
2014-05-22 09:07:24 +08:00
|
|
|
if (!PPCSubTarget->hasISEL())
|
2014-02-28 14:11:16 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
|
|
|
|
UI != UE; ++UI) {
|
|
|
|
SDNode *User = *UI;
|
|
|
|
if (!User->isMachineOpcode())
|
|
|
|
return false;
|
|
|
|
if (User->getMachineOpcode() != PPC::SELECT_I4 &&
|
|
|
|
User->getMachineOpcode() != PPC::SELECT_I8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDNode *Op2 = User->getOperand(2).getNode();
|
|
|
|
if (!Op2->isMachineOpcode())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (Op2->getMachineOpcode() != PPC::LI &&
|
|
|
|
Op2->getMachineOpcode() != PPC::LI8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0));
|
|
|
|
if (!C)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!C->isNullValue())
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
|
|
|
|
SmallVector<SDNode *, 4> ToReplace;
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
|
|
|
|
UI != UE; ++UI) {
|
|
|
|
SDNode *User = *UI;
|
|
|
|
assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
|
|
|
|
User->getMachineOpcode() == PPC::SELECT_I8) &&
|
|
|
|
"Must have all select users");
|
|
|
|
ToReplace.push_back(User);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(),
|
|
|
|
UE = ToReplace.end(); UI != UE; ++UI) {
|
|
|
|
SDNode *User = *UI;
|
|
|
|
SDNode *ResNode =
|
|
|
|
CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User),
|
|
|
|
User->getValueType(0), User->getOperand(0),
|
|
|
|
User->getOperand(2),
|
|
|
|
User->getOperand(1));
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
|
|
|
|
DEBUG(User->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\nNew: ");
|
|
|
|
DEBUG(ResNode->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\n");
|
|
|
|
|
|
|
|
ReplaceUses(User, ResNode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-14 08:31:15 +08:00
|
|
|
void PPCDAGToDAGISel::PeepholeCROps() {
|
2014-02-28 08:27:01 +08:00
|
|
|
bool IsModified;
|
|
|
|
do {
|
|
|
|
IsModified = false;
|
2015-07-15 06:10:54 +08:00
|
|
|
for (SDNode &Node : CurDAG->allnodes()) {
|
|
|
|
MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
|
2014-02-28 08:27:01 +08:00
|
|
|
if (!MachineNode || MachineNode->use_empty())
|
|
|
|
continue;
|
|
|
|
SDNode *ResNode = MachineNode;
|
|
|
|
|
|
|
|
bool Op1Set = false, Op1Unset = false,
|
|
|
|
Op1Not = false,
|
|
|
|
Op2Set = false, Op2Unset = false,
|
|
|
|
Op2Not = false;
|
|
|
|
|
|
|
|
unsigned Opcode = MachineNode->getMachineOpcode();
|
|
|
|
switch (Opcode) {
|
|
|
|
default: break;
|
|
|
|
case PPC::CRAND:
|
|
|
|
case PPC::CRNAND:
|
|
|
|
case PPC::CROR:
|
|
|
|
case PPC::CRXOR:
|
|
|
|
case PPC::CRNOR:
|
|
|
|
case PPC::CREQV:
|
|
|
|
case PPC::CRANDC:
|
|
|
|
case PPC::CRORC: {
|
|
|
|
SDValue Op = MachineNode->getOperand(1);
|
|
|
|
if (Op.isMachineOpcode()) {
|
|
|
|
if (Op.getMachineOpcode() == PPC::CRSET)
|
|
|
|
Op2Set = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRUNSET)
|
|
|
|
Op2Unset = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRNOR &&
|
|
|
|
Op.getOperand(0) == Op.getOperand(1))
|
|
|
|
Op2Not = true;
|
|
|
|
}
|
|
|
|
} // fallthrough
|
|
|
|
case PPC::BC:
|
|
|
|
case PPC::BCn:
|
|
|
|
case PPC::SELECT_I4:
|
|
|
|
case PPC::SELECT_I8:
|
|
|
|
case PPC::SELECT_F4:
|
|
|
|
case PPC::SELECT_F8:
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bytes
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
case PPC::SELECT_QFRC:
|
|
|
|
case PPC::SELECT_QSRC:
|
|
|
|
case PPC::SELECT_QBRC:
|
2014-10-22 21:13:40 +08:00
|
|
|
case PPC::SELECT_VRRC:
|
2014-10-23 00:58:20 +08:00
|
|
|
case PPC::SELECT_VSFRC:
|
2015-05-08 02:24:05 +08:00
|
|
|
case PPC::SELECT_VSSRC:
|
2014-10-22 21:13:40 +08:00
|
|
|
case PPC::SELECT_VSRC: {
|
2014-02-28 08:27:01 +08:00
|
|
|
SDValue Op = MachineNode->getOperand(0);
|
|
|
|
if (Op.isMachineOpcode()) {
|
|
|
|
if (Op.getMachineOpcode() == PPC::CRSET)
|
|
|
|
Op1Set = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRUNSET)
|
|
|
|
Op1Unset = true;
|
|
|
|
else if (Op.getMachineOpcode() == PPC::CRNOR &&
|
|
|
|
Op.getOperand(0) == Op.getOperand(1))
|
|
|
|
Op1Not = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-28 14:11:16 +08:00
|
|
|
bool SelectSwap = false;
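// When none of the identities below applies but every user is a
// select-against-zero, we emit the complementary CR operation instead and set
// SelectSwap so that the users' true and false operands can be exchanged
// afterwards (via SwapAllSelectUsers), leaving the overall result unchanged.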
|
2014-02-28 08:27:01 +08:00
|
|
|
switch (Opcode) {
|
|
|
|
default: break;
|
|
|
|
case PPC::CRAND:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// x & x = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Set)
|
|
|
|
// 1 & y = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Set)
|
|
|
|
// x & 1 = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Unset || Op2Unset)
|
|
|
|
// x & 0 = 0 & y = 0
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Not)
|
|
|
|
// ~x & y = andc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0));
|
|
|
|
else if (Op2Not)
|
|
|
|
// x & ~y = andc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2016-02-19 06:09:30 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode)) {
|
2014-02-28 14:11:16 +08:00
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
2016-02-19 06:09:30 +08:00
|
|
|
MachineNode->getOperand(1));
|
2014-02-28 14:11:16 +08:00
|
|
|
SelectSwap = true;
|
2016-02-19 06:09:30 +08:00
|
|
|
}
|
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CRNAND:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// nand(x, x) -> nor(x, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Set)
|
|
|
|
// nand(1, y) -> nor(y, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Set)
|
|
|
|
// nand(x, 1) -> nor(x, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
|
|
|
else if (Op1Unset || Op2Unset)
|
|
|
|
// nand(x, 0) = nand(0, y) = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Not)
|
|
|
|
// nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(1));
|
|
|
|
else if (Op2Not)
|
|
|
|
// nand(x, ~y) = ~x | y = orc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1).
|
|
|
|
getOperand(0),
|
|
|
|
MachineNode->getOperand(0));
|
2016-02-19 06:09:30 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode)) {
|
2014-02-28 14:11:16 +08:00
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
2016-02-19 06:09:30 +08:00
|
|
|
MachineNode->getOperand(1));
|
2014-02-28 14:11:16 +08:00
|
|
|
SelectSwap = true;
|
2016-02-19 06:09:30 +08:00
|
|
|
}
|
2014-02-28 08:27:01 +08:00
|
|
|
break;
|
|
|
|
case PPC::CROR:
|
|
|
|
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
|
|
|
|
// x | x = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Set || Op2Set)
|
|
|
|
// x | 1 = 1 | y = 1
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
|
|
|
|
MVT::i1);
|
|
|
|
else if (Op1Unset)
|
|
|
|
// 0 | y = y
|
|
|
|
ResNode = MachineNode->getOperand(1).getNode();
|
|
|
|
else if (Op2Unset)
|
|
|
|
// x | 0 = x
|
|
|
|
ResNode = MachineNode->getOperand(0).getNode();
|
|
|
|
else if (Op1Not)
|
|
|
|
// ~x | y = orc(y, x)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(1),
|
|
|
|
MachineNode->getOperand(0).
|
|
|
|
getOperand(0));
|
|
|
|
else if (Op2Not)
|
|
|
|
// x | ~y = orc(x, y)
|
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
|
|
|
MachineNode->getOperand(1).
|
|
|
|
getOperand(0));
|
2016-02-19 06:09:30 +08:00
|
|
|
else if (AllUsersSelectZero(MachineNode)) {
|
2014-02-28 14:11:16 +08:00
|
|
|
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
|
|
|
|
MVT::i1, MachineNode->getOperand(0),
|
2016-02-19 06:09:30 +08:00
|
|
|
MachineNode->getOperand(1));
|
2014-02-28 14:11:16 +08:00
|
|
|
SelectSwap = true;
|
2016-02-19 06:09:30 +08:00
|
|
|
}
|
2014-02-28 08:27:01 +08:00
|
|
|
        break;
      case PPC::CRXOR:
        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
          // xor(x, x) = 0
          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op1Set)
          // xor(1, y) -> nor(y, y)
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(1));
        else if (Op2Set)
          // xor(x, 1) -> nor(x, x)
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(0));
        else if (Op1Unset)
          // xor(0, y) = y
          ResNode = MachineNode->getOperand(1).getNode();
        else if (Op2Unset)
          // xor(x, 0) = x
          ResNode = MachineNode->getOperand(0).getNode();
        else if (Op1Not)
          // xor(~x, y) = eqv(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0).
                                                      getOperand(0),
                                           MachineNode->getOperand(1));
        else if (Op2Not)
          // xor(x, ~y) = eqv(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1).
                                             getOperand(0));
        else if (AllUsersSelectZero(MachineNode)) {
          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1));
          SelectSwap = true;
        }
        break;
      case PPC::CRNOR:
        if (Op1Set || Op2Set)
          // nor(1, y) -> 0
          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op1Unset)
          // nor(0, y) = ~y -> nor(y, y)
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(1));
        else if (Op2Unset)
          // nor(x, 0) = ~x
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(0));
        else if (Op1Not)
          // nor(~x, y) = andc(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0).
                                                      getOperand(0),
                                           MachineNode->getOperand(1));
        else if (Op2Not)
          // nor(x, ~y) = andc(y, x)
          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1).
                                                      getOperand(0),
                                           MachineNode->getOperand(0));
        else if (AllUsersSelectZero(MachineNode)) {
          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1));
          SelectSwap = true;
        }
        break;
      case PPC::CREQV:
        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
          // eqv(x, x) = 1
          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op1Set)
          // eqv(1, y) = y
          ResNode = MachineNode->getOperand(1).getNode();
        else if (Op2Set)
          // eqv(x, 1) = x
          ResNode = MachineNode->getOperand(0).getNode();
        else if (Op1Unset)
          // eqv(0, y) = ~y -> nor(y, y)
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(1));
        else if (Op2Unset)
          // eqv(x, 0) = ~x
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(0));
        else if (Op1Not)
          // eqv(~x, y) = xor(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0).
                                                      getOperand(0),
                                           MachineNode->getOperand(1));
        else if (Op2Not)
          // eqv(x, ~y) = xor(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1).
                                             getOperand(0));
        else if (AllUsersSelectZero(MachineNode)) {
          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1));
          SelectSwap = true;
        }
        break;
      case PPC::CRANDC:
        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
          // andc(x, x) = 0
          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op1Set)
          // andc(1, y) = ~y
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(1));
        else if (Op1Unset || Op2Set)
          // andc(0, y) = andc(x, 1) = 0
          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op2Unset)
          // andc(x, 0) = x
          ResNode = MachineNode->getOperand(0).getNode();
        else if (Op1Not)
          // andc(~x, y) = ~(x | y) = nor(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0).
                                                      getOperand(0),
                                           MachineNode->getOperand(1));
        else if (Op2Not)
          // andc(x, ~y) = x & y
          ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1).
                                             getOperand(0));
        else if (AllUsersSelectZero(MachineNode)) {
          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(0));
          SelectSwap = true;
        }
        break;
      case PPC::CRORC:
        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
          // orc(x, x) = 1
          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op1Set || Op2Unset)
          // orc(1, y) = orc(x, 0) = 1
          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
                                           MVT::i1);
        else if (Op2Set)
          // orc(x, 1) = x
          ResNode = MachineNode->getOperand(0).getNode();
        else if (Op1Unset)
          // orc(0, y) = ~y
          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(1));
        else if (Op1Not)
          // orc(~x, y) = ~(x & y) = nand(x, y)
          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0).
                                                      getOperand(0),
                                           MachineNode->getOperand(1));
        else if (Op2Not)
          // orc(x, ~y) = x | y
          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(0),
                                           MachineNode->getOperand(1).
                                             getOperand(0));
        else if (AllUsersSelectZero(MachineNode)) {
          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
                                           MVT::i1, MachineNode->getOperand(1),
                                           MachineNode->getOperand(0));
          SelectSwap = true;
        }
        break;
      case PPC::SELECT_I4:
      case PPC::SELECT_I8:
      case PPC::SELECT_F4:
      case PPC::SELECT_F8:
      case PPC::SELECT_QFRC:
      case PPC::SELECT_QSRC:
      case PPC::SELECT_QBRC:
      case PPC::SELECT_VRRC:
      case PPC::SELECT_VSFRC:
      case PPC::SELECT_VSSRC:
      case PPC::SELECT_VSRC:
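        // For all of these selects, operand 0 is the i1 condition; a
        // known-true condition selects operand 1, a known-false condition
        // selects operand 2, and an inverted condition is handled below by
        // re-emitting the select with its two value operands swapped.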
        if (Op1Set)
          ResNode = MachineNode->getOperand(1).getNode();
        else if (Op1Unset)
          ResNode = MachineNode->getOperand(2).getNode();
        else if (Op1Not)
          ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
                                           SDLoc(MachineNode),
                                           MachineNode->getValueType(0),
                                           MachineNode->getOperand(0).
                                             getOperand(0),
                                           MachineNode->getOperand(2),
                                           MachineNode->getOperand(1));
        break;
      case PPC::BC:
      case PPC::BCn:
        if (Op1Not)
          ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
                                                               PPC::BC,
                                           SDLoc(MachineNode),
                                           MVT::Other,
                                           MachineNode->getOperand(0).
                                             getOperand(0),
                                           MachineNode->getOperand(1),
                                           MachineNode->getOperand(2));
        // FIXME: Handle Op1Set, Op1Unset here too.
        break;
      }

      // If we're inverting this node because it is used only by selects that
      // we'd like to swap, then swap the selects before the node replacement.
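      // For example, in the CRNOR case above this emits a CROR (the
      // complement) and then rewrites every user (SELECT_* c, x, y) as
      // (SELECT_* c, y, x), which preserves the selected values; that
      // rewriting is what SwapAllSelectUsers does.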
      if (SelectSwap)
        SwapAllSelectUsers(MachineNode);

      if (ResNode != MachineNode) {
        DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
        DEBUG(MachineNode->dump(CurDAG));
        DEBUG(dbgs() << "\nNew: ");
        DEBUG(ResNode->dump(CurDAG));
        DEBUG(dbgs() << "\n");

        ReplaceUses(MachineNode, ResNode);
        IsModified = true;
      }
    }
    if (IsModified)
      CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

// Gather the set of 32-bit operations that are known to have their
// higher-order 32 bits zero, where ToPromote contains all such operations.
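// For example, for a 32-bit subtree such as (OR (SRW a, b), (LI 3)), both
// OR operands are known to leave the higher-order 32 bits clear (SRW always
// clears them, and 3 is a small positive immediate), so the OR, the SRW,
// and the LI would all be collected into ToPromote.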
static bool PeepholePPC64ZExtGather(SDValue Op32,
                                    SmallPtrSetImpl<SDNode *> &ToPromote) {
  if (!Op32.isMachineOpcode())
    return false;

// First, check for the "frontier" instructions (those that will clear the
|
|
|
|
// higher-order 32 bits.
|
|
|
|
|
|
|
|
  // For RLWINM and RLWNM, we need to make sure that the mask does not wrap
  // around. If it does not, then these instructions will clear the
  // higher-order bits.
  if ((Op32.getMachineOpcode() == PPC::RLWINM ||
       Op32.getMachineOpcode() == PPC::RLWNM) &&
      Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
    ToPromote.insert(Op32.getNode());
    return true;
  }

  // SLW and SRW always clear the higher-order bits.
  if (Op32.getMachineOpcode() == PPC::SLW ||
      Op32.getMachineOpcode() == PPC::SRW) {
    ToPromote.insert(Op32.getNode());
    return true;
  }

  // For LI and LIS, we need the immediate to be positive (so that it is not
  // sign extended).
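  // (isUInt<15> keeps the high bit of the 16-bit immediate field clear, so
  // the loaded value is never negative and never sign-extends into the
  // upper word.)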
  if (Op32.getMachineOpcode() == PPC::LI ||
      Op32.getMachineOpcode() == PPC::LIS) {
    if (!isUInt<15>(Op32.getConstantOperandVal(0)))
      return false;

    ToPromote.insert(Op32.getNode());
    return true;
  }

  // LHBRX and LWBRX always clear the higher-order bits.
  if (Op32.getMachineOpcode() == PPC::LHBRX ||
      Op32.getMachineOpcode() == PPC::LWBRX) {
    ToPromote.insert(Op32.getNode());
    return true;
  }

  // CNTLZW always produces a 64-bit value in [0,32], and so is zero extended.
  if (Op32.getMachineOpcode() == PPC::CNTLZW) {
    ToPromote.insert(Op32.getNode());
    return true;
  }

  // Next, check for those instructions we can look through.

  // Assuming the mask does not wrap around, then the higher-order bits are
  // taken directly from the first operand.
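  // (RLWIMI only rewrites the bits selected by its mask; everything outside
  // the mask, including the upper word, comes from the value being inserted
  // into, which is operand 0 here.)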
  if (Op32.getMachineOpcode() == PPC::RLWIMI &&
      Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
    SmallPtrSet<SDNode *, 16> ToPromote1;
    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
      return false;

    ToPromote.insert(Op32.getNode());
    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
    return true;
  }

  // For OR, the higher-order bits are zero if that is true for both operands.
  // For SELECT_I4, the same is true (but the relevant operand numbers are
  // shifted by 1).
  if (Op32.getMachineOpcode() == PPC::OR ||
      Op32.getMachineOpcode() == PPC::SELECT_I4) {
    unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
    SmallPtrSet<SDNode *, 16> ToPromote1;
    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
      return false;
    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
      return false;

    ToPromote.insert(Op32.getNode());
    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
    return true;
  }

  // For ORI and ORIS, we need the higher-order bits of the first operand to be
  // zero, and also for the constant to be positive (so that it is not sign
  // extended).
  if (Op32.getMachineOpcode() == PPC::ORI ||
      Op32.getMachineOpcode() == PPC::ORIS) {
    SmallPtrSet<SDNode *, 16> ToPromote1;
    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
      return false;
    if (!isUInt<15>(Op32.getConstantOperandVal(1)))
      return false;

    ToPromote.insert(Op32.getNode());
    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
    return true;
  }

  // The higher-order bits of AND are zero if that is true for at least one of
  // the operands.
  if (Op32.getMachineOpcode() == PPC::AND) {
    SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
    bool Op0OK =
      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
    bool Op1OK =
      PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
    if (!Op0OK && !Op1OK)
      return false;

    ToPromote.insert(Op32.getNode());

    if (Op0OK)
      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());

    if (Op1OK)
      ToPromote.insert(ToPromote2.begin(), ToPromote2.end());

    return true;
  }

  // For ANDI and ANDIS, the higher-order bits are zero if either that is true
  // of the first operand, or if the second operand is positive (so that it is
  // not sign extended).
  if (Op32.getMachineOpcode() == PPC::ANDIo ||
      Op32.getMachineOpcode() == PPC::ANDISo) {
    SmallPtrSet<SDNode *, 16> ToPromote1;
    bool Op0OK =
      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
    bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
    if (!Op0OK && !Op1OK)
      return false;

    ToPromote.insert(Op32.getNode());

    if (Op0OK)
      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());

    return true;
  }

  return false;
}

void PPCDAGToDAGISel::PeepholePPC64ZExt() {
  if (!PPCSubTarget->isPPC64())
    return;

  // When we zero-extend from i32 to i64, we use a pattern like this:
  // def : Pat<(i64 (zext i32:$in)),
  //           (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
  //                   0, 32)>;
  // There are several 32-bit shift/rotate instructions, however, that will
  // clear the higher-order bits of their output, rendering the RLDICL
  // unnecessary. When that happens, we remove it here, and redefine the
  // relevant 32-bit operation to be a 64-bit operation.
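  //
  // For example, a selected DAG such as:
  //   t1: i32 = SRW t2, t3
  //   t4: i64 = INSERT_SUBREG IMPLICIT_DEF, t1, sub_32
  //   t5: i64 = RLDICL t4, 0, 32
  // can drop the RLDICL (and the then-dead INSERT_SUBREG) once the SRW is
  // rewritten as SRW8, because SRW already leaves the upper 32 bits of its
  // 64-bit result clear.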

  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
  ++Position;

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (N->getMachineOpcode() != PPC::RLDICL)
      continue;

    if (N->getConstantOperandVal(1) != 0 ||
        N->getConstantOperandVal(2) != 32)
      continue;

    SDValue ISR = N->getOperand(0);
    if (!ISR.isMachineOpcode() ||
        ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
      continue;

    if (!ISR.hasOneUse())
      continue;

    if (ISR.getConstantOperandVal(2) != PPC::sub_32)
      continue;

    SDValue IDef = ISR.getOperand(0);
    if (!IDef.isMachineOpcode() ||
        IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
      continue;

    // We now know that we're looking at a canonical i32 -> i64 zext. See if we
    // can get rid of it.

    SDValue Op32 = ISR->getOperand(1);
    if (!Op32.isMachineOpcode())
      continue;

    // There are some 32-bit instructions that always clear the high-order 32
    // bits, there are also some instructions (like AND) that we can look
    // through.
    SmallPtrSet<SDNode *, 16> ToPromote;
    if (!PeepholePPC64ZExtGather(Op32, ToPromote))
      continue;

    // If the ToPromote set contains nodes that have uses outside of the set
    // (except for the original INSERT_SUBREG), then abort the transformation.
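    // (Otherwise we would be changing the result type of a value that an
    // un-promoted user still expects to be i32.)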
    bool OutsideUse = false;
    for (SDNode *PN : ToPromote) {
      for (SDNode *UN : PN->uses()) {
        if (!ToPromote.count(UN) && UN != ISR.getNode()) {
          OutsideUse = true;
          break;
        }
      }

      if (OutsideUse)
        break;
    }
    if (OutsideUse)
      continue;

    MadeChange = true;

    // We now know that this zero extension can be removed by promoting the
    // nodes in ToPromote to 64-bit operations, where for operations in the
    // frontier of the set, we need to insert INSERT_SUBREGs for their
    // operands.
    for (SDNode *PN : ToPromote) {
      unsigned NewOpcode;
      switch (PN->getMachineOpcode()) {
      default:
        llvm_unreachable("Don't know the 64-bit variant of this instruction");
      case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break;
      case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break;
      case PPC::SLW: NewOpcode = PPC::SLW8; break;
      case PPC::SRW: NewOpcode = PPC::SRW8; break;
      case PPC::LI: NewOpcode = PPC::LI8; break;
      case PPC::LIS: NewOpcode = PPC::LIS8; break;
      case PPC::LHBRX: NewOpcode = PPC::LHBRX8; break;
      case PPC::LWBRX: NewOpcode = PPC::LWBRX8; break;
      case PPC::CNTLZW: NewOpcode = PPC::CNTLZW8; break;
      case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break;
      case PPC::OR: NewOpcode = PPC::OR8; break;
      case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
      case PPC::ORI: NewOpcode = PPC::ORI8; break;
      case PPC::ORIS: NewOpcode = PPC::ORIS8; break;
      case PPC::AND: NewOpcode = PPC::AND8; break;
      case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break;
      case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break;
      }

      // Note: During the replacement process, the nodes will be in an
      // inconsistent state (some instructions will have operands with values
      // of the wrong type). Once done, however, everything should be right
      // again.

      SmallVector<SDValue, 4> Ops;
      for (const SDValue &V : PN->ops()) {
        if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
            !isa<ConstantSDNode>(V)) {
          SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
          SDNode *ReplOp =
            CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
                                   ISR.getNode()->getVTList(), ReplOpOps);
          Ops.push_back(SDValue(ReplOp, 0));
        } else {
          Ops.push_back(V);
        }
      }

      // Because all to-be-promoted nodes only have users that are other
      // promoted nodes (or the original INSERT_SUBREG), we can safely replace
      // the i32 result value type with i64.

      SmallVector<EVT, 2> NewVTs;
      SDVTList VTs = PN->getVTList();
      for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
        if (VTs.VTs[i] == MVT::i32)
          NewVTs.push_back(MVT::i64);
        else
          NewVTs.push_back(VTs.VTs[i]);

      DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
      DEBUG(PN->dump(CurDAG));

      CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);

      DEBUG(dbgs() << "\nNew: ");
      DEBUG(PN->dump(CurDAG));
      DEBUG(dbgs() << "\n");
    }

    // Now we replace the original zero extend and its associated INSERT_SUBREG
    // with the value feeding the INSERT_SUBREG (which has now been promoted to
    // return an i64).

    DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
    DEBUG(N->dump(CurDAG));
    DEBUG(dbgs() << "\nNew: ");
    DEBUG(Op32.getNode()->dump(CurDAG));
    DEBUG(dbgs() << "\n");

    ReplaceUses(N, Op32.getNode());
  }

  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}

// PPCDAGToDAGISel::PostprocessISelDAG() performs post-selection peephole
// optimizations on the DAG representation. One such optimization is
// implemented here: folds to clean up complex addressing expressions for
// thread-local storage and medium code model. It will also be useful for
// large code model sequences when those are added later. Doing this on the
// MI representation prior to register assignment was considered, but it is
// difficult to do effective global dead code elimination at that point,
// whereas DCE is trivial on the DAG representation.
//
// A typical example of a candidate code sequence in assembly:
//   addis 3, 2, globalvar@toc@ha
//   addi  3, 3, globalvar@toc@l
//   lwz   5, 0(3)
// When the final instruction is a load or store with an immediate offset
// of zero, the offset from the add-immediate can replace the zero, provided
// the relocation information is carried along:
//   addis 3, 2, globalvar@toc@ha
//   lwz   5, globalvar@toc@l(3)
// Since the addi can in general have multiple uses, we need to only delete
// the instruction when the last use is removed.
void PPCDAGToDAGISel::PeepholePPC64() {
  // These optimizations are currently supported only for 64-bit SVR4.
  if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
    return;

  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
  ++Position;

  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    unsigned FirstOp;
    unsigned StorageOpcode = N->getMachineOpcode();

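    // FirstOp is the index of the memory displacement operand: loads produce
    // a value, so their displacement is operand 0, while stores take the
    // stored value as operand 0 and the displacement as operand 1.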
    switch (StorageOpcode) {
    default: continue;

    case PPC::LBZ:
    case PPC::LBZ8:
    case PPC::LD:
    case PPC::LFD:
    case PPC::LFS:
    case PPC::LHA:
    case PPC::LHA8:
    case PPC::LHZ:
    case PPC::LHZ8:
    case PPC::LWA:
    case PPC::LWZ:
    case PPC::LWZ8:
      FirstOp = 0;
      break;

    case PPC::STB:
    case PPC::STB8:
    case PPC::STD:
    case PPC::STFD:
    case PPC::STFS:
    case PPC::STH:
    case PPC::STH8:
    case PPC::STW:
    case PPC::STW8:
      FirstOp = 1;
      break;
    }

    // If this is a load or store with a zero offset, or an offset within the
    // alignment of an aligned global, we may be able to fold an add-immediate
    // into the memory operation. The check against the alignment is further
    // below, as it can't occur until we have looked at the arguments to N.
    //
    // Access to aligned globals gives us a chance to peephole optimize
    // nonzero offsets. If a struct is 4-byte aligned, then accesses to
    // bytes 0-3 won't overflow the available displacement. For example:
    //   addis 3, 2, b4v@toc@ha
    //   addi  4, 3, b4v@toc@l
    //   lbz   5, b4v@toc@l(3) ; this is the result of the zero-offset fold
    //   lbz   6, 1(4)
    //   lbz   7, 2(4)
    //   lbz   8, 3(4)
    // If b4v is 4-byte aligned, we can skip using register 4 because we know
    // that b4v@toc@l+{1,2,3} won't overflow 32K, and instead generate:
    //   addis 3, 2, b4v@toc@ha
    //   lbz   4, b4v@toc@l(3)
    //   lbz   5, b4v@toc@l+1(3)
    //   lbz   6, b4v@toc@l+2(3)
    //   lbz   7, b4v@toc@l+3(3)
    // saving a register and an addition. Larger alignments allow larger
    // structures/arrays to be optimized.
    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
|
2013-02-21 08:38:25 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
SDValue Base = N->getOperand(FirstOp + 1);
|
|
|
|
if (!Base.isMachineOpcode())
|
|
|
|
continue;
|
|
|
|
|
2015-12-11 08:47:36 +08:00
|
|
|
// On targets with fusion, we don't want this to fire and remove a fusion
|
|
|
|
// opportunity, unless a) it results in another fusion opportunity or
|
|
|
|
// b) we are optimizing for size.
|
|
|
|
if (PPCSubTarget->hasFusion() &&
|
2015-12-11 08:58:32 +08:00
|
|
|
(!MF->getFunction()->optForSize() && !Base.hasOneUse()))
|
2015-12-11 08:47:36 +08:00
|
|
|
continue;
|
|
|
|
|
2013-02-21 08:38:25 +08:00
|
|
|
unsigned Flags = 0;
|
|
|
|
bool ReplaceFlags = true;
|
|
|
|
|
|
|
|
// When the feeding operation is an add-immediate of some sort,
|
|
|
|
// determine whether we need to add relocation information to the
|
|
|
|
// target flags on the immediate operand when we fold it into the
|
|
|
|
// load instruction.
|
|
|
|
//
|
|
|
|
// For something like ADDItocL, the relocation information is
|
|
|
|
// inferred from the opcode; when we process it in the AsmPrinter,
|
|
|
|
// we add the necessary relocation there. A load, though, can receive
|
|
|
|
// relocation from various flavors of ADDIxxx, so we need to carry
|
|
|
|
// the relocation information in the target flags.
|
|
|
|
switch (Base.getMachineOpcode()) {
|
|
|
|
default: continue;
|
|
|
|
|
|
|
|
case PPC::ADDI8:
|
2013-03-26 18:55:20 +08:00
|
|
|
case PPC::ADDI:
|
2013-02-21 08:38:25 +08:00
|
|
|
// In some cases (such as TLS) the relocation information
|
|
|
|
// is already in place on the operand, so copying the operand
|
|
|
|
// is sufficient.
|
|
|
|
ReplaceFlags = false;
|
|
|
|
// For these cases, the immediate may not be divisible by 4, in
|
|
|
|
// which case the fold is illegal for DS-form instructions. (The
|
|
|
|
// other cases provide aligned addresses and are always safe.)
|
|
|
|
if ((StorageOpcode == PPC::LWA ||
|
|
|
|
StorageOpcode == PPC::LD ||
|
|
|
|
StorageOpcode == PPC::STD) &&
|
|
|
|
(!isa<ConstantSDNode>(Base.getOperand(1)) ||
|
|
|
|
Base.getConstantOperandVal(1) % 4 != 0))
|
|
|
|
continue;
|
|
|
|
break;
|
|
|
|
case PPC::ADDIdtprelL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_DTPREL_LO;
|
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
case PPC::ADDItlsldL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_TLSLD_LO;
|
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
case PPC::ADDItocL:
|
2013-06-21 22:42:20 +08:00
|
|
|
Flags = PPCII::MO_TOC_LO;
|
2013-02-21 08:38:25 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-12-11 08:47:36 +08:00
|
|
|
SDValue ImmOpnd = Base.getOperand(1);
|
|
|
|
int MaxDisplacement = 0;
|
|
|
|
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
|
|
|
|
const GlobalValue *GV = GA->getGlobal();
|
|
|
|
MaxDisplacement = GV->getAlignment() - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Offset = N->getConstantOperandVal(FirstOp);
|
|
|
|
if (Offset < 0 || Offset > MaxDisplacement)
|
|
|
|
continue;
|
|
|
|
|
2013-02-21 08:38:25 +08:00
|
|
|
// We found an opportunity. Reverse the operands from the add
|
|
|
|
// immediate and substitute them into the load or store. If
|
|
|
|
// needed, update the target flags for the immediate operand to
|
|
|
|
// reflect the necessary relocation information.
|
|
|
|
DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
|
|
|
|
DEBUG(Base->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\nN: ");
|
|
|
|
DEBUG(N->dump(CurDAG));
|
|
|
|
DEBUG(dbgs() << "\n");
|
|
|
|
|
|
|
|
// If the relocation information isn't already present on the
|
|
|
|
// immediate operand, add it now.
|
|
|
|
if (ReplaceFlags) {
|
2013-02-21 22:35:42 +08:00
|
|
|
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(GA);
|
2013-02-21 08:38:25 +08:00
|
|
|
const GlobalValue *GV = GA->getGlobal();
|
Index: test/CodeGen/PowerPC/reloc-align.ll
===================================================================
--- test/CodeGen/PowerPC/reloc-align.ll (revision 0)
+++ test/CodeGen/PowerPC/reloc-align.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc -mcpu=pwr7 -O1 < %s | FileCheck %s
+
+; This test verifies that the peephole optimization of address accesses
+; does not produce a load or store with a relocation that can't be
+; satisfied for a given instruction encoding. Reduced from a test supplied
+; by Hal Finkel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S1 = type { [8 x i8] }
+
+@main.l_1554 = internal global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 -1, i8 -6, i8 57, i8 62, i8 -48, i8 0, i8 58, i8 80 }, align 1
+
+; Function Attrs: nounwind readonly
+define signext i32 @main() #0 {
+entry:
+ %call = tail call fastcc signext i32 @func_90(%struct.S1* byval bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @main.l_1554 to %struct.S1*))
+; CHECK-NOT: ld {{[0-9]+}}, main.l_1554@toc@l
+ ret i32 %call
+}
+
+; Function Attrs: nounwind readonly
+define internal fastcc signext i32 @func_90(%struct.S1* byval nocapture %p_91) #0 {
+entry:
+ %0 = bitcast %struct.S1* %p_91 to i64*
+ %bf.load = load i64* %0, align 1
+ %bf.shl = shl i64 %bf.load, 26
+ %bf.ashr = ashr i64 %bf.shl, 54
+ %bf.cast = trunc i64 %bf.ashr to i32
+ ret i32 %bf.cast
+}
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp (working copy)
@@ -679,7 +679,26 @@ void PPCAsmPrinter::EmitInstruction(const MachineI
OutStreamer.EmitRawText(StringRef("\tmsync"));
return;
}
+ break;
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA: {
+ // Verify alignment is legal, so we don't create relocations
+ // that can't be supported.
+ // FIXME: This test is currently disabled for Darwin. The test
+ // suite shows a handful of test cases that fail this check for
+ // Darwin. Those need to be investigated before this sanity test
+ // can be enabled for those subtargets.
+ if (!Subtarget.isDarwin()) {
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+ }
+ // Now process the instruction normally.
+ break;
}
+ }
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
OutStreamer.EmitInstruction(TmpInst);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp (revision 185327)
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp (working copy)
@@ -1530,6 +1530,14 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ // We can't perform this optimization for data whose alignment
+ // is insufficient for the instruction encoding.
+ if (GV->getAlignment() < 4 &&
+ (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+ StorageOpcode == PPC::LWA)) {
+ DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ continue;
+ }
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
llvm-svn: 185380
2013-07-02 04:52:27 +08:00
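The underlying constraint is that LD, STD and LWA are DS-form instructions: the two low bits of their displacement are implicitly zero, so any folded displacement must be a multiple of 4. The standalone sketch below states that requirement in isolation; it is not the in-tree condition, and foldIsLegal and the small opcode enum are hypothetical.

#include <cstdio>

enum Opcode { LWZ, LBZ, LD, STD, LWA };

// LD, STD and LWA are DS-form: the displacement's two low bits are
// implicitly zero, so the folded displacement must be a multiple of 4.
bool isDSForm(Opcode Op) { return Op == LD || Op == STD || Op == LWA; }

bool foldIsLegal(Opcode Op, unsigned GlobalAlign, int Offset) {
  if (!isDSForm(Op))
    return true;                              // D-form: no extra constraint here
  return GlobalAlign >= 4 && Offset % 4 == 0; // word-aligned data and offset
}

int main() {
  std::printf("%d\n", foldIsLegal(LD, 1, 0));  // 0: global only 1-byte aligned
  std::printf("%d\n", foldIsLegal(LD, 8, 4));  // 1: aligned global, offset 4
  std::printf("%d\n", foldIsLegal(LBZ, 1, 0)); // 1: D-form is unaffected
  return 0;
}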
|
|
|
// We can't perform this optimization for data whose alignment
|
|
|
|
// is insufficient for the instruction encoding.
|
|
|
|
if (GV->getAlignment() < 4 &&
|
|
|
|
(StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
|
2015-12-11 08:47:36 +08:00
|
|
|
StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
|
2013-07-02 04:52:27 +08:00
|
|
|
DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
|
|
|
|
continue;
|
|
|
|
}
|
2015-12-11 08:47:36 +08:00
|
|
|
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
|
2013-02-22 01:26:05 +08:00
|
|
|
} else if (ConstantPoolSDNode *CP =
|
|
|
|
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
|
2013-02-21 22:35:42 +08:00
|
|
|
const Constant *C = CP->getConstVal();
|
|
|
|
ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
|
|
|
|
CP->getAlignment(),
|
2015-12-11 08:47:36 +08:00
|
|
|
Offset, Flags);
|
2013-02-21 08:38:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (FirstOp == 1) // Store
|
|
|
|
(void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
|
|
|
|
Base.getOperand(0), N->getOperand(3));
|
|
|
|
else // Load
|
|
|
|
(void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
|
|
|
|
N->getOperand(2));
|
|
|
|
|
|
|
|
// The add-immediate may now be dead, in which case remove it.
|
|
|
|
if (Base.getNode()->use_empty())
|
|
|
|
CurDAG->RemoveDeadNode(Base.getNode());
|
|
|
|
}
|
|
|
|
}
|
2005-08-18 03:33:03 +08:00
|
|
|
|
2006-06-10 09:15:02 +08:00
|
|
|
|
2010-12-24 12:28:06 +08:00
|
|
|
/// createPPCISelDag - This pass converts a legalized DAG into a
|
2005-08-18 03:33:03 +08:00
|
|
|
/// PowerPC-specific DAG, ready for instruction scheduling.
|
|
|
|
///
|
2006-03-14 07:20:37 +08:00
|
|
|
FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
|
2005-10-18 08:28:58 +08:00
|
|
|
return new PPCDAGToDAGISel(TM);
|
2005-08-18 03:33:03 +08:00
|
|
|
}
|